Setup

# NOTE(review): removes every object that ls() lists in the current environment
# before the analysis starts; it does not detach packages or reset options.
rm(list=ls())
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(lubridate)
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.3.3
library(ggthemes)

Read in the data

LFPR

# Labour-force participation tables (female and total). The first four lines
# of each file are skipped before the header row is read.
data_FemaleLFPR <- read.csv(
  file = "../data/data_FemaleLFPR.csv",
  skip = 4
)
data_LFPR <- read.csv(
  file = "../data/data_LFPR.csv",
  skip = 4
)

Demographic characteristics

# Demographic inputs. Files read with skip = 4 have their first four lines
# dropped before the header row; the age-structure file is read from its
# first line (read.csv's default, skip = 0).
data_PropFemale <- read.csv(
  file = "../data/data_Population%Female.csv",
  skip = 4
)
data_PopulationAgeStructure <- read.csv(
  file = "../data/data_PopulationAgeStructure.csv"
)
data_Population <- read.csv(
  file = "../data/data_Population.csv",
  skip = 4
)
data_FertilityRate <- read.csv(
  file = "../data/data_FertilityRate.csv",
  skip = 4
)
data_TertiaryEducation <- read.csv(
  file = "../data/data_TertiaryEducation.csv",
  skip = 4
)

Economic characteristics

# Economic inputs. Every file here is read with its first four lines skipped
# before the header row.
data_GDPPerCapita <- read.csv(
  file = "../data/data_GDPPerCapita.csv",
  skip = 4
)
data_Unemployment <- read.csv(
  file = "../data/data_Unemployment.csv",
  skip = 4
)
data_GiniCoefficient <- read.csv(
  file = "../data/data_GiniCoefficient.csv",
  skip = 4
)
data_Agriculture <- read.csv(
  file = "../data/data_Agriculture.csv",
  skip = 4
)
data_Manufacturing <- read.csv(
  file = "../data/data_Manufacturing.csv",
  skip = 4
)
data_Industry <- read.csv(
  file = "../data/data_Industry.csv",
  skip = 4
)
data_Services <- read.csv(
  file = "../data/data_Services.csv",
  skip = 4
)
# NOTE(review): the object is named "Services..." while the file on disk is
# "data_ServiceEmployment.csv" (no "s"); the path is kept exactly as written.
data_ServicesEmployment <- read.csv(
  file = "../data/data_ServiceEmployment.csv",
  skip = 4
)

Cultural characteristics

# Welfare, pay-gap, leave and gender-equality inputs. All of these files are
# read from their first line (read.csv's default, skip = 0).
data_ExpenditureonFamily <- read.csv(
  file = "../data/data_GDP%ExpenditureonFamily.csv"
)
data_ExpenditureonIncapacity <- read.csv(
  file = "../data/data_GDP%ExpenditureonIncapacity.csv"
)
data_WelfareCoverage <- read.csv(
  file = "../data/data_Population%Covered.csv"
)
data_GenderPayGap <- read.csv(
  file = "../data/data_GenderPayGap.csv"
)
data_MaternityLeave <- read.csv(
  file = "../data/data_MaternityLeave.csv"
)
data_PaternityLeave <- read.csv(
  file = "../data/data_PaternityLeave.csv"
)
data_GenderEquality <- read.csv(
  file = "../data/data_GenderEquality.csv"
)

Tidy data

First, list the non-country entries in the World Bank data so that they can be filtered out later:

# Country.Name values in the World Bank tables that are not individual
# countries. clean_WB() drops every row whose Country.Name is listed here.
nonCountryDatapointsa <- c(
  "Africa Eastern and Southern",
  "Africa Western and Central",
  "Arab World",
  "Caribbean small states",
  "Central Europe and the Baltics",
  "Early-demographic dividend",
  "East Asia & Pacific",
  "East Asia & Pacific (IDA & IBRD countries)",
  "East Asia & Pacific (excluding high income)",
  "Euro area",
  "Europe & Central Asia",
  "Europe & Central Asia (IDA & IBRD countries)",
  "Europe & Central Asia (excluding high income)",
  "European Union",
  "Fragile and conflict affected situations",
  "Heavily indebted poor countries (HIPC)",
  "High income",
  "IBRD only",
  "IDA & IBRD total",
  "IDA blend",
  "IDA only",
  "IDA total",
  "Late-demographic dividend",
  "Latin America & Caribbean",
  "Latin America & Caribbean (excluding high income)",
  "Latin America & Caribbean (excluding high income) LAC",
  "Latin America & the Caribbean (IDA & IBRD countries)",
  "Latin America & the Caribbean (IDA & IBRD countries) TLA",
  "Least developed countries: UN classification",
  "Low & middle income",
  "Low income",
  "Lower middle income",
  "Middle East & North Africa",
  "Middle East & North Africa (IDA & IBRD countries)",
  "Middle East & North Africa (excluding high income)",
  "Middle income",
  "North America",
  "Not classified",
  "OECD members",
  "Other small states",
  "Pacific island small states",
  "Post-demographic dividend",
  "Pre-demographic dividend",
  "Small states",
  "South Asia",
  "South Asia (IDA & IBRD)",
  "Sub-Saharan Africa",
  "Sub-Saharan Africa (IDA & IBRD countries)",
  "Sub-Saharan Africa (excluding high income)",
  "Upper middle income",
  "World"
)

Function for cleaning World Bank data:

# Reduce a wide World Bank indicator table to one row per country.
#
# Arguments:
#   df      - data frame with Country.Name, Country.Code and the yearly
#             columns X2020 through X2024.
#   measure - string used as the name of the resulting value column.
#
# Returns a tibble with columns Country.Name, <measure>, Country.Code. For
# each country the value is the last non-missing entry among X2020-X2024;
# the year columns are stacked in their column order, so "last" is the
# right-most year with data. Rows whose Country.Name is listed in
# nonCountryDatapointsa are excluded, and a country with no value in that
# window does not appear in the result.
clean_WB <- function(df, measure) {
  countries_only <- df %>%
    dplyr::filter(!(Country.Name %in% nonCountryDatapointsa))

  # Long format: one row per country-year, with missing observations removed.
  recent_long <- countries_only %>%
    select(Country.Name, Country.Code, X2020:X2024) %>%
    pivot_longer(
      cols = !c(Country.Name, Country.Code),
      names_to = "Year",
      values_to = "value"
    ) %>%
    drop_na(value)

  # group_by() also orders the result by Country.Name.
  latest <- recent_long %>%
    group_by(Country.Name) %>%
    summarise(
      value = last(value),
      Country.Code = last(Country.Code),
      .groups = "drop"
    )

  rename(latest, !!measure := value)
}

Demographic characteristics

# Preview the first rows of the raw female LFPR table as read from CSV.
head(data_FemaleLFPR)
##                  Country.Name Country.Code
## 1                       Aruba          ABW
## 2 Africa Eastern and Southern          AFE
## 3                 Afghanistan          AFG
## 4  Africa Western and Central          AFW
## 5                      Angola          AGO
## 6                     Albania          ALB
##                                                                                    Indicator.Name
## 1 Labor force participation rate, female (% of female population ages 15+) (modeled ILO estimate)
## 2 Labor force participation rate, female (% of female population ages 15+) (modeled ILO estimate)
## 3 Labor force participation rate, female (% of female population ages 15+) (modeled ILO estimate)
## 4 Labor force participation rate, female (% of female population ages 15+) (modeled ILO estimate)
## 5 Labor force participation rate, female (% of female population ages 15+) (modeled ILO estimate)
## 6 Labor force participation rate, female (% of female population ages 15+) (modeled ILO estimate)
##      Indicator.Code X1960 X1961 X1962 X1963 X1964 X1965 X1966 X1967 X1968 X1969
## 1 SL.TLF.CACT.FE.ZS    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 2 SL.TLF.CACT.FE.ZS    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 3 SL.TLF.CACT.FE.ZS    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 4 SL.TLF.CACT.FE.ZS    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 5 SL.TLF.CACT.FE.ZS    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 6 SL.TLF.CACT.FE.ZS    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
##   X1970 X1971 X1972 X1973 X1974 X1975 X1976 X1977 X1978 X1979 X1980 X1981 X1982
## 1    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 2    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 3    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 4    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 5    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 6    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
##   X1983 X1984 X1985 X1986 X1987 X1988 X1989    X1990    X1991    X1992    X1993
## 1    NA    NA    NA    NA    NA    NA    NA       NA       NA       NA       NA
## 2    NA    NA    NA    NA    NA    NA    NA 65.53770 65.69234 65.85133 65.93088
## 3    NA    NA    NA    NA    NA    NA    NA 15.78900 15.74000 15.66000 15.54400
## 4    NA    NA    NA    NA    NA    NA    NA 69.94014 69.93172 69.81323 69.77922
## 5    NA    NA    NA    NA    NA    NA    NA 75.60700 75.57200 75.54000 75.51300
## 6    NA    NA    NA    NA    NA    NA    NA 50.84600 53.40700 54.03600 53.20400
##      X1994    X1995    X1996    X1997    X1998    X1999    X2000    X2001
## 1       NA       NA       NA       NA       NA       NA       NA       NA
## 2 66.13733 66.33431 66.32750 66.23818 66.22135 66.23986 66.26010 66.27387
## 3 15.39300 15.22600 15.04400 14.84800 14.65300 14.47100 14.32100 14.22300
## 4 69.73839 69.67772 69.57907 69.49287 69.40707 69.40180 69.35713 69.29550
## 5 75.49200 75.48100 75.48000 75.48600 75.49800 75.51300 75.53100 75.55100
## 6 52.51400 51.52100 50.88000 51.71000 51.08600 50.26600 49.84900 49.35300
##      X2002    X2003    X2004    X2005    X2006    X2007    X2008    X2009
## 1       NA       NA       NA       NA       NA       NA       NA       NA
## 2 66.35391 66.44016 66.47926 66.52541 66.37408 66.13387 65.88168 65.51176
## 3 14.17700 14.17800 14.22800 14.32300 14.47200 14.68600 14.95700 15.25600
## 4 69.15380 69.02790 68.90672 68.76926 68.63647 68.54642 68.31940 68.07177
## 5 75.57000 75.58900 75.60600 75.61800 75.62600 75.63100 75.63100 75.62600
## 6 49.08800 48.46100 47.84500 47.24200 46.65200 46.07600 45.51300 45.95500
##      X2010    X2011    X2012    X2013    X2014    X2015    X2016    X2017
## 1       NA       NA       NA       NA       NA       NA       NA       NA
## 2 64.77445 65.32562 65.16964 65.12264 64.16107 63.71232 63.12078 62.39547
## 3 15.57600 15.89500 16.20800 17.13800 18.10700 19.11300 20.15600 21.24000
## 4 67.82694 67.69748 67.49655 67.14115 66.78181 66.46139 66.23346 65.72373
## 5 75.62100 74.86500 74.63300 74.40100 74.16700 73.93300 73.69800 73.46100
## 6 46.83500 52.56100 48.93700 43.78300 43.95800 47.11600 49.88800 49.73600
##      X2018    X2019    X2020    X2021    X2022    X2023    X2024  X
## 1       NA       NA       NA       NA       NA       NA       NA NA
## 2 61.84047 61.35699 59.77301 60.27955 60.44514 64.07521 63.90776 NA
## 3 19.80800 18.30400 16.47300 14.66600  5.15900  5.15500  5.10400 NA
## 4 65.42507 65.17822 64.43911 64.78000 64.73852 66.34934 66.18173 NA
## 5 73.22400 72.98500 72.73100 74.69500 73.17900 73.22400 73.14000 NA
## 6 51.40700 52.93000 50.61600 51.88700 53.18300 53.20700 53.51200 NA
# Preview the first rows of the raw total LFPR table as read from CSV.
head(data_LFPR)
##                  Country.Name Country.Code
## 1                       Aruba          ABW
## 2 Africa Eastern and Southern          AFE
## 3                 Afghanistan          AFG
## 4  Africa Western and Central          AFW
## 5                      Angola          AGO
## 6                     Albania          ALB
##                                                                                  Indicator.Name
## 1 Labor force participation rate, total (% of total population ages 15+) (modeled ILO estimate)
## 2 Labor force participation rate, total (% of total population ages 15+) (modeled ILO estimate)
## 3 Labor force participation rate, total (% of total population ages 15+) (modeled ILO estimate)
## 4 Labor force participation rate, total (% of total population ages 15+) (modeled ILO estimate)
## 5 Labor force participation rate, total (% of total population ages 15+) (modeled ILO estimate)
## 6 Labor force participation rate, total (% of total population ages 15+) (modeled ILO estimate)
##   Indicator.Code X1960 X1961 X1962 X1963 X1964 X1965 X1966 X1967 X1968 X1969
## 1 SL.TLF.CACT.ZS    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 2 SL.TLF.CACT.ZS    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 3 SL.TLF.CACT.ZS    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 4 SL.TLF.CACT.ZS    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 5 SL.TLF.CACT.ZS    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 6 SL.TLF.CACT.ZS    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
##   X1970 X1971 X1972 X1973 X1974 X1975 X1976 X1977 X1978 X1979 X1980 X1981 X1982
## 1    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 2    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 3    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 4    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 5    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 6    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
##   X1983 X1984 X1985 X1986 X1987 X1988 X1989    X1990    X1991    X1992    X1993
## 1    NA    NA    NA    NA    NA    NA    NA       NA       NA       NA       NA
## 2    NA    NA    NA    NA    NA    NA    NA 72.11155 72.17933 72.27927 72.27015
## 3    NA    NA    NA    NA    NA    NA    NA 47.25100 47.19800 47.13700 47.06700
## 4    NA    NA    NA    NA    NA    NA    NA 76.59380 76.61586 76.51132 76.54951
## 5    NA    NA    NA    NA    NA    NA    NA 77.27100 77.31900 77.36400 77.40400
## 6    NA    NA    NA    NA    NA    NA    NA 61.78300 64.29900 64.88300 64.05700
##      X1994    X1995    X1996    X1997    X1998    X1999    X2000    X2001
## 1       NA       NA       NA       NA       NA       NA       NA       NA
## 2 72.35346 72.49353 72.52087 72.48101 72.41696 72.33893 72.28231 72.22234
## 3 46.98600 46.90500 46.82700 46.75100 46.68200 46.62100 46.57100 46.53300
## 4 76.56316 76.53043 76.41852 76.33737 76.26321 76.23509 76.11999 75.98527
## 5 77.43900 77.45600 77.45600 77.44700 77.43000 77.41100 77.38900 77.36400
## 6 63.35800 62.34300 61.66300 62.46400 61.79500 60.91300 60.43500 59.88800
##      X2002    X2003    X2004    X2005    X2006    X2007    X2008    X2009
## 1       NA       NA       NA       NA       NA       NA       NA       NA
## 2 72.20879 72.20830 72.16941 72.14703 71.98197 71.76300 71.53610 71.14870
## 3 46.50800 46.49900 46.50700 46.53200 46.57000 46.62100 46.68000 46.74300
## 4 75.68472 75.49284 75.29329 75.10964 74.93536 74.73455 74.43295 74.09511
## 5 77.33900 77.31300 77.29000 77.27200 77.26000 77.25200 77.25200 77.25600
## 6 59.61000 58.55700 57.49600 56.42800 55.35400 54.27500 53.19200 54.99300
##      X2010    X2011    X2012    X2013    X2014    X2015    X2016    X2017
## 1       NA       NA       NA       NA       NA       NA       NA       NA
## 2 70.46491 70.92204 70.87421 70.83214 69.95276 69.62207 69.13831 68.50009
## 3 46.81100 46.88100 46.95600 47.02600 47.09600 47.16500 47.23500 47.30500
## 4 73.76231 73.53483 73.25900 72.90333 72.53744 72.27095 72.21715 71.75875
## 5 77.26000 77.31500 77.03700 76.75600 76.47300 76.18700 75.89900 75.60900
## 6 55.20200 59.93800 56.99500 52.41800 53.42000 55.49700 57.31400 58.05700
##      X2018    X2019    X2020    X2021    X2022    X2023    X2024  X
## 1       NA       NA       NA       NA       NA       NA       NA NA
## 2 68.08854 67.74673 66.29093 66.79551 67.00764 69.30694 69.18241 NA
## 3 45.57000 43.82300 41.57900 40.93200 37.64000 37.67300 37.49600 NA
## 4 71.53272 71.32954 70.80324 70.90759 70.91841 71.41612 71.28294 NA
## 5 75.31600 75.02100 74.72900 76.37200 75.61500 75.75000 75.70300 NA
## 6 59.29600 60.30600 57.78000 58.86900 60.42100 60.43200 60.67500 NA
# Collapse both LFPR tables to one row per country with clean_WB(), then
# preview the female series.
data_FemaleLFPR_tidy <- data_FemaleLFPR %>% clean_WB("Female.LFPR")
data_LFPR_tidy <- data_LFPR %>% clean_WB("LFPR")
head(data_FemaleLFPR_tidy)
## # A tibble: 6 × 3
##   Country.Name Female.LFPR Country.Code
##   <chr>              <dbl> <chr>       
## 1 Afghanistan         5.10 AFG         
## 2 Albania            53.5  ALB         
## 3 Algeria            14.0  DZA         
## 4 Angola             73.1  AGO         
## 5 Argentina          52.9  ARG         
## 6 Armenia            56.4  ARM
# Preview the first rows of the raw female-population-share table.
head(data_PropFemale)
##                  Country.Name Country.Code
## 1                       Aruba          ABW
## 2 Africa Eastern and Southern          AFE
## 3                 Afghanistan          AFG
## 4  Africa Western and Central          AFW
## 5                      Angola          AGO
## 6                     Albania          ALB
##                               Indicator.Name    Indicator.Code    X1960
## 1 Population, female (% of total population) SP.POP.TOTL.FE.ZS 50.84848
## 2 Population, female (% of total population) SP.POP.TOTL.FE.ZS 50.48167
## 3 Population, female (% of total population) SP.POP.TOTL.FE.ZS 48.08829
## 4 Population, female (% of total population) SP.POP.TOTL.FE.ZS 50.36868
## 5 Population, female (% of total population) SP.POP.TOTL.FE.ZS 49.36403
## 6 Population, female (% of total population) SP.POP.TOTL.FE.ZS 49.63153
##      X1961    X1962    X1963    X1964    X1965    X1966    X1967    X1968
## 1 50.83395 50.83097 50.83069 50.82525 50.82660 50.83825 50.84082 50.83716
## 2 50.48439 50.48799 50.49292 50.49852 50.50500 50.51271 50.51805 50.52027
## 3 48.16973 48.24648 48.31892 48.38751 48.45271 48.51496 48.57419 48.62915
## 4 50.39844 50.42297 50.43892 50.45402 50.46745 50.47840 50.49145 50.50495
## 5 49.24822 49.17658 49.10501 49.03395 48.96378 48.89516 48.82515 48.75194
## 6 49.61255 49.57760 49.54491 49.51565 49.49047 49.46999 49.45644 49.45469
##      X1969    X1970    X1971    X1972    X1973    X1974    X1975    X1976
## 1 50.83994 50.84648 50.86084 50.88378 50.91864 50.95473 50.98665 51.03230
## 2 50.52144 50.52783 50.53253 50.53259 50.52867 50.52338 50.52177 50.52395
## 3 48.67991 48.72774 48.77298 48.81549 48.85468 48.89056 48.92343 48.95351
## 4 50.51338 50.51154 50.49962 50.48708 50.47719 50.46908 50.46391 50.45860
## 5 48.67509 48.70903 48.85272 48.99319 49.12712 49.25198 49.37374 49.49393
## 6 49.44856 49.42667 49.40089 49.37683 49.35579 49.33779 49.32308 49.31141
##      X1977    X1978    X1979    X1980    X1981    X1982    X1983    X1984
## 1 51.09413 51.15201 51.20927 51.26292 51.31129 51.31917 51.28851 51.24941
## 2 50.53077 50.54149 50.55006 50.52845 50.51003 50.51416 50.50989 50.51295
## 3 48.98069 49.01610 49.07308 49.14256 49.21562 49.31879 49.43961 49.60103
## 4 50.45299 50.44948 50.44576 50.43786 50.42582 50.41522 50.40657 50.39124
## 5 49.60769 49.71315 49.81089 49.90154 49.98563 50.06350 50.14279 50.22306
## 6 49.30178 49.29769 49.30832 49.32752 49.34743 49.37306 49.40450 49.44087
##      X1985    X1986    X1987    X1988    X1989    X1990    X1991    X1992
## 1 51.19223 51.12383 51.04088 50.94772 50.89842 50.92887 51.04819 51.16519
## 2 50.50629 50.48600 50.46638 50.44993 50.44242 50.43587 50.43400 50.48089
## 3 49.79831 49.93787 50.02022 50.07163 50.08772 50.09682 50.10532 50.11505
## 4 50.37116 50.35338 50.33514 50.31360 50.29017 50.26808 50.24906 50.22856
## 5 50.29688 50.36424 50.43869 50.51637 50.58429 50.64631 50.69250 50.73150
## 6 49.48208 49.52579 49.56762 49.60832 49.67099 49.76549 49.87355 49.98902
##      X1993    X1994    X1995    X1996    X1997    X1998    X1999    X2000
## 1 51.23609 51.30823 51.40942 51.53156 51.63796 51.73496 51.82602 51.88878
## 2 50.42971 50.46689 50.63644 50.76851 50.85067 50.84193 50.83800 50.83103
## 3 50.09946 50.07072 50.04250 50.00597 49.96733 49.93187 49.89475 49.85354
## 4 50.20727 50.18972 50.17463 50.16033 50.14760 50.13005 50.10750 50.08628
## 5 50.79308 50.86074 50.88272 50.87011 50.85717 50.85390 50.86738 50.87913
## 6 50.11363 50.24616 50.38601 50.53274 50.68744 50.84856 51.01128 51.17801
##      X2001    X2002    X2003    X2004    X2005    X2006    X2007    X2008
## 1 51.94065 51.98787 52.01980 52.05175 52.09700 52.14743 52.19647 52.24375
## 2 50.81932 50.80429 50.78521 50.76560 50.74651 50.72599 50.70553 50.68732
## 3 49.81444 49.77482 49.73536 49.69777 49.66246 49.63251 49.60850 49.58623
## 4 50.06633 50.04202 50.01321 49.98585 49.96319 49.94412 49.92527 49.90073
## 5 50.88612 50.88872 50.87450 50.85106 50.82651 50.80062 50.77388 50.74781
## 6 51.24742 51.18176 51.08517 50.99288 50.90592 50.82527 50.75016 50.67827
##      X2009    X2010    X2011    X2012    X2013    X2014    X2015    X2016
## 1 52.28882 52.27940 52.28857 52.36481 52.43816 52.50597 52.56565 52.62060
## 2 50.67264 50.65775 50.64105 50.62561 50.60957 50.59146 50.56575 50.55426
## 3 49.56271 49.53944 49.51849 49.49977 49.48265 49.46812 49.45720 49.44955
## 4 49.87676 49.85371 49.82468 49.79561 49.76611 49.74448 49.72998 49.71410
## 5 50.72268 50.69876 50.67764 50.65990 50.64424 50.63001 50.61682 50.60359
## 6 50.60831 50.54083 50.51035 50.51471 50.51708 50.51681 50.51462 50.51069
##      X2017    X2018    X2019    X2020    X2021    X2022    X2023 X2024  X
## 1 52.67379 52.73132 52.79614 52.84451 52.83196 52.79267 52.76962    NA NA
## 2 50.55150 50.54016 50.53181 50.52512 50.51942 50.51171 50.50284    NA NA
## 3 49.45492 49.47445 49.49391 49.51122 49.53438 49.54101 49.52551    NA NA
## 4 49.69888 49.68481 49.67276 49.66354 49.65476 49.64911 49.64667    NA NA
## 5 50.59035 50.57764 50.56543 50.55582 50.54686 50.53626 50.52589    NA NA
## 6 50.50650 50.50253 50.49691 50.50732 50.53210 50.54948 50.55923    NA NA
# Preview the first rows of the raw age-structure table.
head(data_PopulationAgeStructure)
##   flagCode  country AgeStructure_PctAt0To14_pct_2024
## 1       NE    Niger                             49.5
## 2       UG   Uganda                             47.0
## 3       AO   Angola                             46.9
## 4       ML     Mali                             46.8
## 5       TD     Chad                             45.8
## 6       CD DR Congo                             45.7
##   AgeStructure_PctAt15To64_pct_2024 AgeStructure_PctAt65Plus_pct_2024
## 1                              47.8                               2.7
## 2                              50.6                               2.4
## 3                              50.7                               2.4
## 4                              50.1                               3.1
## 5                              51.7                               2.5
## 6                              51.8                               2.5
# Preview the first rows of the raw total-population table.
head(data_Population)
##                  Country.Name Country.Code    Indicator.Name Indicator.Code
## 1                       Aruba          ABW Population, total    SP.POP.TOTL
## 2 Africa Eastern and Southern          AFE Population, total    SP.POP.TOTL
## 3                 Afghanistan          AFG Population, total    SP.POP.TOTL
## 4  Africa Western and Central          AFW Population, total    SP.POP.TOTL
## 5                      Angola          AGO Population, total    SP.POP.TOTL
## 6                     Albania          ALB Population, total    SP.POP.TOTL
##       X1960     X1961     X1962     X1963     X1964     X1965     X1966
## 1     54922     55578     56320     57002     57619     58190     58694
## 2 130072080 133534923 137171659 140945536 144904094 149033472 153281203
## 3   9035043   9214083   9404406   9604487   9814318  10036008  10266395
## 4  97630925  99706674 101854756 104089175 106388440 108772632 111246953
## 5   5231654   5301583   5354310   5408320   5464187   5521981   5581386
## 6   1608800   1659800   1711319   1762621   1814135   1864791   1914573
##       X1967     X1968     X1969     X1970     X1971     X1972     X1973
## 1     58990     59069     59052     58950     58781     58047     58299
## 2 157704381 162329396 167088245 171984985 177022314 182126556 187524135
## 3  10505959  10756922  11017409  11290128  11567667  11853696  12157999
## 4 113795019 116444636 119203521 122086536 125072948 128176494 131449942
## 5   5641807   5702699   5763685   5852788   5991102   6174262   6388528
## 6   1965598   2022272   2081695   2135479   2187853   2243126   2296752
##       X1974     X1975     X1976     X1977     X1978     X1979     X1980
## 1     58349     58295     58368     58580     58776     59191     59909
## 2 193186642 198914573 204802976 210680842 217074286 223974122 230792729
## 3  12469127  12773954  13059851  13340756  13611441  13655567  13169311
## 4 134911581 138569918 142337272 146258576 150402616 154721711 159166518
## 5   6613367   6842947   7074664   7317829   7576734   7847207   8133872
## 6   2350124   2404831   2458526   2513546   2566266   2617832   2671997
##       X1981     X1982     X1983     X1984     X1985     X1986     X1987
## 1     60563     61276     62228     62901     61728     59931     59159
## 2 238043099 245822010 253644643 261458202 269450407 277621771 286067346
## 3  11937581  10991378  10917982  11190221  11426852  11420074  11387818
## 4 163762473 168585118 173255157 177880746 182811038 187889141 193104347
## 5   8435607   8751648   9082983   9425917   9779120  10139450  10497858
## 6   2726056   2784278   2843960   2904429   2964762   3022635   3083605
##       X1988     X1989     X1990     X1991     X1992     X1993     X1994
## 1     59331     60443     62753     65896     69005     73685     77595
## 2 294498625 302939121 311748681 320442961 329082707 338324002 347441809
## 3  11523298  11874088  12045660  12238879  13278974  14943172  16250794
## 4 198485027 204062274 209566031 215178709 221191375 227246778 233360104
## 5  10861291  11238562  11626360  12023529  12423712  12827135  13249764
## 6   3142336   3227943   3286542   3266790   3247039   3227287   3207536
##       X1995     X1996     X1997     X1998     X1999     X2000     X2001
## 1     79805     83021     86301     88451     89659     90588     91439
## 2 356580375 366138524 375646235 385505757 395750933 406156661 416807868
## 3  17065836  17763266  18452091  19159996  19887785  20130327  20284307
## 4 239801875 246415446 253207584 260297834 267506298 274968446 282780717
## 5  13699778  14170973  14660413  15159370  15667235  16194869  16747208
## 6   3187784   3168033   3148281   3128530   3108778   3089027   3060173
##       X2002     X2003     X2004     X2005     X2006     X2007     X2008
## 1     92074     93128     95138     97635     99405    100150    100917
## 2 427820358 439173286 450928044 463076637 475606210 488580707 502070763
## 3  21378117  22733049  23560654  24404567  25424094  25909852  26482622
## 4 290841795 299142845 307725100 316588476 325663158 334984176 344586109
## 5  17327699  17943712  18600423  19291161  20015279  20778561  21578655
## 6   3051010   3039616   3026939   3011487   2992547   2970017   2947314
##       X2009     X2010     X2011     X2012     X2013     X2014     X2015
## 1    101604    101838    102591    104110    105675    106807    107906
## 2 516003448 530308387 544737983 559609961 575202699 590968990 607123269
## 3  27466101  28284089  29347708  30560034  31622704  32792523  33831764
## 4 354343844 364358270 374790143 385360349 396030207 406992047 418127845
## 5  22414773  23294825  24218352  25177394  26165620  27160769  28157798
## 6   2927519   2913021   2905195   2900401   2895092   2889104   2880703
##       X2016     X2017     X2018     X2019     X2020     X2021     X2022
## 1    108727    108735    108908    109203    108587    107700    107310
## 2 623369401 640058741 657801085 675950189 694446100 713090928 731821393
## 3  34700612  35688935  36743039  37856121  39068979  40000412  40578842
## 4 429454743 440882906 452195915 463365429 474569351 485920997 497387180
## 5  29183070  30234839  31297155  32375632  33451132  34532429  35635029
## 6   2876101   2873457   2866376   2854191   2837849   2811666   2777689
##       X2023 X2024  X
## 1    107359    NA NA
## 2 750503764    NA NA
## 3  41454761    NA NA
## 4 509398589    NA NA
## 5  36749906    NA NA
## 6   2745972    NA NA
# Preview the first rows of the raw fertility-rate table.
head(data_FertilityRate)
##                  Country.Name Country.Code
## 1                       Aruba          ABW
## 2 Africa Eastern and Southern          AFE
## 3                 Afghanistan          AFG
## 4  Africa Western and Central          AFW
## 5                      Angola          AGO
## 6                     Albania          ALB
##                             Indicator.Name Indicator.Code    X1960    X1961
## 1 Fertility rate, total (births per woman) SP.DYN.TFRT.IN 4.567000 4.422000
## 2 Fertility rate, total (births per woman) SP.DYN.TFRT.IN 6.650330 6.667308
## 3 Fertility rate, total (births per woman) SP.DYN.TFRT.IN 7.282000 7.284000
## 4 Fertility rate, total (births per woman) SP.DYN.TFRT.IN 6.468887 6.478351
## 5 Fertility rate, total (births per woman) SP.DYN.TFRT.IN 6.708000 6.790000
## 6 Fertility rate, total (births per woman) SP.DYN.TFRT.IN 6.383000 6.273000
##      X1962    X1963    X1964    X1965    X1966    X1967    X1968    X1969
## 1 4.262000 4.107000 3.940000 3.797000 3.621000 3.452000 3.277000 3.111000
## 2 6.688246 6.709226 6.724930 6.737459 6.766486 6.775493 6.782523 6.782813
## 3 7.292000 7.302000 7.304000 7.305000 7.320000 7.339000 7.363000 7.389000
## 4 6.492277 6.500229 6.516739 6.532766 6.556550 6.582717 6.607297 6.633569
## 5 6.872000 6.954000 7.036000 7.116000 7.194000 7.267000 7.332000 7.388000
## 6 6.106000 5.927000 5.714000 5.474000 5.325000 5.310000 5.317000 5.287000
##      X1970    X1971    X1972    X1973    X1974    X1975    X1976    X1977
## 1 2.973000 2.862000 2.755000 2.655000 2.573000 2.499000 2.432000 2.372000
## 2 6.792709 6.798998 6.798352 6.796038 6.792972 6.783362 6.768292 6.749405
## 3 7.400000 7.432000 7.453000 7.487000 7.526000 7.542000 7.561000 7.591000
## 4 6.655836 6.697323 6.733532 6.764148 6.802286 6.839113 6.858526 6.890310
## 5 7.434000 7.467000 7.488000 7.498000 7.500000 7.494000 7.485000 7.475000
## 6 5.158000 5.083000 4.979000 4.829000 4.700000 4.566000 4.374000 4.204000
##      X1978    X1979    X1980    X1981    X1982    X1983    X1984    X1985
## 1 2.312000 2.257000 2.203000 2.161000 2.142000 2.148000 2.170000 2.188000
## 2 6.733325 6.715178 6.694612 6.663769 6.635449 6.595348 6.549886 6.500241
## 3 7.599000 7.612000 7.643000 7.617000 7.600000 7.570000 7.554000 7.550000
## 4 6.916096 6.904869 6.888044 6.871587 6.846727 6.819426 6.770109 6.717738
## 5 7.467000 7.461000 7.459000 7.459000 7.461000 7.462000 7.459000 7.451000
## 6 3.996000 3.782000 3.590000 3.468000 3.416000 3.319000 3.282000 3.214000
##      X1986    X1987    X1988    X1989    X1990    X1991    X1992    X1993
## 1 2.212000 2.243000 2.273000 2.306000 2.345000 2.362000 2.353000 2.331000
## 2 6.448271 6.398680 6.329041 6.249929 6.160212 6.094846 6.028219 5.970246
## 3 7.553000 7.548000 7.551000 7.559000 7.576000 7.631000 7.703000 7.761000
## 4 6.676484 6.629947 6.594412 6.562254 6.516886 6.470298 6.417682 6.362002
## 5 7.435000 7.409000 7.373000 7.328000 7.272000 7.208000 7.138000 7.065000
## 6 3.141000 3.106000 3.069000 3.030000 3.014000 2.956000 2.885000 2.811000
##      X1994    X1995    X1996    X1997    X1998    X1999    X2000    X2001
## 1 2.298000 2.288000 2.233000 2.138000 2.001000 1.897000 1.845000 1.813000
## 2 5.911238 5.853480 5.781580 5.710820 5.662886 5.611619 5.549893 5.501766
## 3 7.767000 7.767000 7.757000 7.732000 7.693000 7.641000 7.566000 7.453000
## 4 6.299172 6.236263 6.171442 6.103496 6.044794 6.027932 6.022143 5.990074
## 5 6.990000 6.918000 6.851000 6.789000 6.732000 6.683000 6.639000 6.601000
## 6 2.798000 2.762000 2.666000 2.535000 2.425000 2.313000 2.217000 2.141000
##      X2002    X2003    X2004    X2005    X2006    X2007    X2008    X2009
## 1 1.800000 1.808000 1.819000 1.844000 1.862000 1.881000 1.889000 1.875000
## 2 5.449696 5.407328 5.381308 5.350399 5.305920 5.253985 5.219403 5.142535
## 3 7.320000 7.174000 7.018000 6.858000 6.686000 6.508000 6.392000 6.295000
## 4 5.956202 5.923573 5.895514 5.871770 5.846028 5.809651 5.785473 5.751120
## 5 6.567000 6.533000 6.499000 6.461000 6.419000 6.372000 6.320000 6.260000
## 6 2.002000 2.006000 1.889000 1.787000 1.686000 1.633000 1.616000 1.638000
##      X2010    X2011    X2012    X2013    X2014    X2015    X2016    X2017
## 1 1.855000 1.858000 1.907000 1.944000 1.944000 1.899000 1.848000 1.785000
## 2 5.066497 4.981016 4.899585 4.819563 4.748074 4.676508 4.615744 4.569884
## 3 6.195000 6.094000 5.985000 5.879000 5.770000 5.652000 5.542000 5.433000
## 4 5.712896 5.665667 5.596800 5.515468 5.440018 5.346596 5.228976 5.098885
## 5 6.194000 6.120000 6.039000 5.953000 5.864000 5.774000 5.686000 5.600000
## 6 1.653000 1.677000 1.719000 1.741000 1.721000 1.631000 1.555000 1.486000
##      X2018    X2019    X2020    X2021    X2022    X2023 X2024  X
## 1 1.732000 1.701000 1.662000 1.631000 1.615000 1.602000    NA NA
## 2 4.521443 4.471338 4.412973 4.350683 4.287033 4.223771    NA NA
## 3 5.327000 5.238000 5.145000 5.039000 4.932000 4.840000    NA NA
## 4 4.962571 4.829134 4.707399 4.637741 4.563354 4.497707    NA NA
## 5 5.519000 5.442000 5.371000 5.304000 5.209000 5.124000    NA NA
## 6 1.415000 1.395000 1.371000 1.365000 1.355000 1.348000    NA NA
# Tidy each raw World Bank demographic extract with the shared clean_WB()
# helper (defined earlier in this file). The second argument is the label for
# the indicator -- presumably the name of the resulting value column; confirm
# against clean_WB().
data_Population_tidy <- clean_WB(data_Population, "Population")
data_FertilityRate_tidy <- clean_WB(data_FertilityRate, "Fertility.Rate")
data_TertiaryEducation_tidy <- clean_WB(data_TertiaryEducation, "Tertiary.Education")
# Backward-compatible alias: this table used to be created only under the
# misspelled name `data_TertiaryEducation_tiday`, so keep that name bound for
# any code that still refers to it.
data_TertiaryEducation_tiday <- data_TertiaryEducation_tidy
data_PropFemale_tidy <- clean_WB(data_PropFemale, "Prop.Female")
# Prepare the population age-structure table: shorten the three 2024 age-share
# column names and rewrite country names via the lookup below (the targets look
# like World Bank spellings, presumably so the table can be matched on country
# name later -- confirm against the join code).
# NOTE(review): `X15to65_2024` is filled from the 15-64 source column
# (`AgeStructure_PctAt15To64_pct_2024`); the name is kept because later code
# refers to it.
PAS_temp <- local({
  # Source spelling (name) -> replacement spelling (value).
  country_lookup <- c(
    "DR Congo" = "Congo, Dem. Rep.",
    "Republic of the Congo" = "Congo, Rep.",
    "Egypt" = "Egypt, Arab Rep.",
    "Gambia" = "Gambia, The",
    "Syria" = "Syrian Arab Republic",
    "Palestine" = "West Bank and Gaza",
    "Ivory Coast" = "Cote d'Ivoire",
    "Kyrgyzstan" = "Kyrgyz Republic",
    "Cape Verde" = "Cabo Verde",
    "Laos" = "Lao PDR",
    "Iran" = "Iran, Islamic Rep.",
    "Bahamas" = "Bahamas, The",
    "Brunei" = "Brunei Darussalam",
    "South Korea" = "Korea, Rep.",
    "Vietnam" = "Viet Nam",
    "Saint Kitts and Nevis" = "St. Kitts and Nevis",
    "Sint Maarten" = "Sint Maarten (Dutch part)",
    "Turkey" = "Turkiye",
    "Czech Republic" = "Czechia",
    "Macau" = "Macao SAR, China",
    "Russia" = "Russian Federation",
    "Hong Kong" = "Hong Kong SAR, China",
    "Slovakia" = "Slovak Republic",
    "United States Virgin Islands" = "Virgin Islands (U.S.)",
    "Saint Vincent and the Grenadines" = "St. Vincent and the Grenadines",
    "Saint Lucia" = "St. Lucia",
    "Micronesia" = "Micronesia, Fed. Sts.",
    "North Korea" = "Korea, Dem. People's Rep.",
    "Venezuela" = "Venezuela, RB",
    "Yemen" = "Yemen, Rep.",
    "Saint Martin" = "St. Martin (French part)"
  )

  data_PopulationAgeStructure %>%
    rename(
      X0to14_2024 = AgeStructure_PctAt0To14_pct_2024,
      X15to65_2024 = AgeStructure_PctAt15To64_pct_2024,
      X65plus_2024 = AgeStructure_PctAt65Plus_pct_2024
    ) %>%
    # Names not present in the lookup are left unchanged by recode().
    mutate(country = recode(country, !!!country_lookup))
})
# Replace the separate Jersey and Guernsey rows with one "Channel Islands" row
# whose age shares are the population-weighted average of the two islands.
data_PopulationAgeStructure_tidy <- local({
  # Population weights; both figures are from World Population Review.
  pop_jersey <- 103989
  pop_guernsey <- 64477

  # Weighted average of one age-share column across the two islands.
  islands_share <- function(column) {
    jersey <- PAS_temp[PAS_temp$country == "Jersey", column]
    guernsey <- PAS_temp[PAS_temp$country == "Guernsey", column]
    (jersey * pop_jersey + guernsey * pop_guernsey) / (pop_jersey + pop_guernsey)
  }

  PAS_temp %>%
    add_row(
      flagCode = NA,
      country = "Channel Islands",
      X0to14_2024 = islands_share("X0to14_2024"),
      X15to65_2024 = islands_share("X15to65_2024"),
      X65plus_2024 = islands_share("X65plus_2024")
    ) %>%
    # The individual islands are now represented by the combined row.
    filter(!(country %in% c("Jersey", "Guernsey")))
})

Economic characteristics

# Preview the first rows of the GDP-per-capita table as read from CSV.
head(data_GDPPerCapita)
##                  Country.Name Country.Code
## 1                       Aruba          ABW
## 2 Africa Eastern and Southern          AFE
## 3                 Afghanistan          AFG
## 4  Africa Western and Central          AFW
## 5                      Angola          AGO
## 6                     Albania          ALB
##                                  Indicator.Name    Indicator.Code X1960 X1961
## 1 GDP per capita, PPP (current international $) NY.GDP.PCAP.PP.CD    NA    NA
## 2 GDP per capita, PPP (current international $) NY.GDP.PCAP.PP.CD    NA    NA
## 3 GDP per capita, PPP (current international $) NY.GDP.PCAP.PP.CD    NA    NA
## 4 GDP per capita, PPP (current international $) NY.GDP.PCAP.PP.CD    NA    NA
## 5 GDP per capita, PPP (current international $) NY.GDP.PCAP.PP.CD    NA    NA
## 6 GDP per capita, PPP (current international $) NY.GDP.PCAP.PP.CD    NA    NA
##   X1962 X1963 X1964 X1965 X1966 X1967 X1968 X1969 X1970 X1971 X1972 X1973 X1974
## 1    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 2    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 3    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 4    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 5    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 6    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
##   X1975 X1976 X1977 X1978 X1979 X1980 X1981 X1982 X1983 X1984 X1985 X1986 X1987
## 1    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 2    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 3    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 4    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 5    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 6    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
##   X1988 X1989     X1990     X1991     X1992     X1993     X1994     X1995
## 1    NA    NA 21732.858 23099.940 23889.045 24575.661 25791.043 26254.743
## 2    NA    NA  1804.850  1817.905  1775.858  1766.045  1788.170  1858.436
## 3    NA    NA        NA        NA        NA        NA        NA        NA
## 4    NA    NA  1727.098  1759.836  1791.207  1764.645  1753.347  1778.289
## 5    NA    NA  3340.598  3372.597  3143.441  2369.242  2374.021  2695.806
## 6    NA    NA  2549.242  1908.942  1823.142  2057.284  2289.685  2665.542
##       X1996     X1997     X1998     X1999      X2000     X2001      X2002
## 1 26004.496 27240.803 27412.755 27765.653 30245.7070 31920.239 31888.5087
## 2  1944.803  2011.694  2019.400  2052.057  2116.2394  2186.630  2249.0415
## 3        NA        NA        NA        NA   813.5503   747.688   926.5079
## 4  1844.819  1908.391  1946.772  1953.486  2016.0918  2108.608  2278.4623
## 5  3013.333  3178.495  3254.244  3262.976  3326.7799  3427.854  3824.2848
## 6  2979.809  2717.129  3021.036  3471.650  3861.2959  4300.829  4661.3865
##       X2003      X2004     X2005     X2006     X2007     X2008     X2009
## 1 32507.084 35059.2731 35098.798 35937.595 37768.566 38904.999 34339.939
## 2  2308.118  2438.8993  2602.006  2785.792  2971.975  3082.151  3041.089
## 3   966.962   971.6335  1076.087  1121.834  1286.950  1333.747  1570.698
## 4  2383.607  2567.0755  2722.787  2873.342  3024.431  3185.419  3305.684
## 5  3878.508  4262.9060  4876.288  5404.384  6095.999  6651.378  6498.169
## 6  5000.329  5427.8809  5865.322  6566.515  7285.035  8228.343  8812.729
##       X2010     X2011     X2012     X2013     X2014     X2015     X2016
## 1 33729.512 35324.072 34095.647 35901.653 35657.287 35972.866 36117.508
## 2  3149.020  3253.724  3171.570  3290.844  3426.576  3456.365  3551.069
## 3  1765.538  1744.061  1988.429  2133.241  2224.491  2284.076  2213.181
## 4  3477.320  3620.650  3735.925  3899.646  4103.766  4052.734  3996.864
## 5  6607.022  6711.390  7354.943  7561.039  7990.274  7119.726  6843.736
## 6  9627.114 10207.726 10526.242 10570.989 11259.240 11662.036 12078.859
##       X2017     X2018     X2019     X2020     X2021     X2022     X2023 X2024
## 1 37524.928 39287.020 39110.276 28976.464 35696.309 41649.451 44967.345    NA
## 2  3703.887  3648.311  3742.849  3629.508  3907.952  4229.682  4374.230    NA
## 3  2335.796  2432.277  2583.485  2561.982  2144.167  2122.996  2211.281    NA
## 4  4063.185  4198.081  4454.156  4441.937  4698.927  5107.273  5343.469    NA
## 5  6992.729  7347.800  7528.382  6450.750  7408.127  7924.889  8040.702    NA
## 6 12771.034 13696.789 14792.257 14511.984 16127.753 19446.237 21263.196    NA
##    X
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
# Preview the first rows of the Gini-coefficient table as read from CSV.
head(data_GiniCoefficient)
##                  Country.Name Country.Code Indicator.Name Indicator.Code X1960
## 1                       Aruba          ABW     Gini index    SI.POV.GINI    NA
## 2 Africa Eastern and Southern          AFE     Gini index    SI.POV.GINI    NA
## 3                 Afghanistan          AFG     Gini index    SI.POV.GINI    NA
## 4  Africa Western and Central          AFW     Gini index    SI.POV.GINI    NA
## 5                      Angola          AGO     Gini index    SI.POV.GINI    NA
## 6                     Albania          ALB     Gini index    SI.POV.GINI    NA
##   X1961 X1962 X1963 X1964 X1965 X1966 X1967 X1968 X1969 X1970 X1971 X1972 X1973
## 1    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 2    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 3    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 4    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 5    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 6    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
##   X1974 X1975 X1976 X1977 X1978 X1979 X1980 X1981 X1982 X1983 X1984 X1985 X1986
## 1    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 2    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 3    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 4    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 5    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 6    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
##   X1987 X1988 X1989 X1990 X1991 X1992 X1993 X1994 X1995 X1996 X1997 X1998 X1999
## 1    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 2    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 3    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 4    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 5    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 6    NA    NA    NA    NA    NA    NA    NA    NA    NA    27    NA    NA    NA
##   X2000 X2001 X2002 X2003 X2004 X2005 X2006 X2007 X2008 X2009 X2010 X2011 X2012
## 1    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 2    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 3    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 4    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 5  51.9    NA    NA    NA    NA    NA    NA    NA  42.7    NA    NA    NA    NA
## 6    NA    NA  31.7    NA    NA  30.6    NA    NA  30.0    NA    NA    NA    29
##   X2013 X2014 X2015 X2016 X2017 X2018 X2019 X2020 X2021 X2022 X2023 X2024  X
## 1    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA NA
## 2    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA NA
## 3    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA NA
## 4    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA NA
## 5    NA    NA    NA    NA    NA  51.3    NA    NA    NA    NA    NA    NA NA
## 6    NA  34.6  32.8  33.7  33.1  30.1  30.1  29.4    NA    NA    NA    NA NA
# Tidy each raw economic table with the shared clean_WB() helper (defined
# earlier in this file). The second argument is the indicator label; the sector
# join further down refers to columns named Agri, Manu, Indus and Serv, which
# suggests the label becomes the value column's name -- confirm against
# clean_WB().
data_GDPPerCapita_tidy <- clean_WB(data_GDPPerCapita, "GDP.PC")
data_GiniCoefficient_tidy <- clean_WB(data_GiniCoefficient, "Gini.Coefficient")
data_Unemployment_tidy <- clean_WB(data_Unemployment, "Unemployment")
data_ServicesEmployment_tidy <- clean_WB(data_ServicesEmployment, "Services.Employment")
# Sector tables: these four feed the sector-similarity calculation below.
data_Agriculture_tidy <- clean_WB(data_Agriculture, "Agri")
data_Manufacturing_tidy <- clean_WB(data_Manufacturing, "Manu")
data_Industry_tidy <- clean_WB(data_Industry, "Indus")
data_Services_tidy <- clean_WB(data_Services, "Serv")

# Build one wide table of sector figures per country: start from agriculture
# (which keeps its Country.Code) and full-join manufacturing, industry and
# services on Country.Name. Missing sector values are then set to 0 so every
# row has a complete numeric sector profile.
data_Sectors <- list(
  data_Manufacturing_tidy,
  data_Industry_tidy,
  data_Services_tidy
) %>%
  reduce(
    function(joined, sector_table) {
      # Drop the duplicate code column before joining by name.
      full_join(joined, select(sector_table, -Country.Code), by = "Country.Name")
    },
    .init = data_Agriculture_tidy
  ) %>%
  mutate(across(c(Agri, Manu, Indus, Serv), \(share) replace_na(share, 0)))

# Singapore's sector profile as a plain numeric vector, in the order
# Agri, Manu, Indus, Serv.
sectors_SG <- local({
  singapore <- filter(data_Sectors, Country.Name == "Singapore")
  singapore_long <- pivot_longer(
    singapore,
    c(Agri, Manu, Indus, Serv),
    names_to = "Sector",
    values_to = "Prop"
  )
  singapore_long$Prop
})

# Every country's sector profile as a numeric matrix (one row per country,
# columns in the same order as `sectors_SG`).
sectors_all <- data_Sectors %>%
  select(Agri, Manu, Indus, Serv) %>%
  data.matrix()

# Dot product of each country's profile with Singapore's profile.
sectors_similarities <- sectors_all %*% sectors_SG

# Attach the square root of that dot product as the similarity score.
data_Sectors_tidy <- cbind(
  data_Sectors,
  Sector.Similarity = sectors_similarities ^ 0.5
)

Cultural characteristics

# Labels of aggregate (non-country) entries -- regions, income groups and
# blocs such as APEC, G20 or "World". Rows whose `ref_area.label` matches one
# of these are excluded when the welfare-coverage table is filtered below.
# NOTE(review): "Datepoints" in the name looks like a typo for "Datapoints";
# the name is left as-is because the filter below refers to it.
nonCountryDatepoints2 <- c("APEC", "ASEAN",
                           "Africa",
                           "Africa: Low income",
                           "Africa: Lower-middle income",
                           "Africa: Upper-middle income",
                           "Americas",
                           "Americas: High income",
                           "Americas: Lower-middle income",
                           "Americas: Upper-middle income",
                           "Arab League",
                           "Arab States",
                           "Arab States: High income",
                           "Arab States: Low income",
                           "Arab States: Lower-middle income",
                           "Arab States: Upper-middle income",
                           "Asia and the Pacific",
                           "Asia and the Pacific: High income",
                           "Asia and the Pacific: Low income",
                           "Asia and the Pacific: Lower-middle income",
                           "Asia and the Pacific: Upper-middle income",
                           "BRICS", "CARICOM", "Caribbean",
                           "Central Africa",
                           "Central America",
                           "Central Asia",
                           "Central and Western Asia",
                           "Central and Western Asia: High income",
                           "Central and Western Asia: Lower-middle income",
                           "Central and Western Asia: Upper-middle income",
                           "Eastern Africa",
                           "Eastern Asia",
                           "Eastern Asia: High income",
                           "Eastern Europe",
                           "Eastern Europe: High income",
                           "Eastern Europe: Upper-middle income",
                           "Europe and Central Asia",
                           "Europe and Central Asia: High income",
                           "Europe and Central Asia: Lower-middle income",
                           "Europe and Central Asia: Upper-middle income",
                           "European Union 27", "European Union 28",
                           "G20", "G7",
                           "Latin America and the Caribbean",
                           "Latin America and the Caribbean: High income",
                           "Latin America and the Caribbean: Lower-middle income",
                           "Latin America and the Caribbean: Upper-middle income",
                           "MENA",
                           "Northern Africa",
                           "Northern Africa: Lower-middle income",
                           "Northern America",
                           "Northern America: High income",
                           "Northern Europe",
                           "Northern, Southern and Western Europe",
                           "Northern, Southern and Western Europe: High income",
                           "Northern, Southern and Western Europe: Upper-middle income",
                           "Pacific Islands",
                           "South America",
                           "South-Eastern Asia",
                           "South-Eastern Asia and the Pacific",
                           "South-Eastern Asia and the Pacific: High income",
                           "South-Eastern Asia and the Pacific: Lower-middle income",
                           "South-Eastern Asia and the Pacific: Upper-middle income",
                           "Southern Africa",
                           "Southern Asia",
                           "Southern Asia: Lower-middle income",
                           "Southern Europe",
                           "Sub-Saharan Africa",
                           "Sub-Saharan Africa: Low income",
                           "Sub-Saharan Africa: Lower-middle income",
                           "Sub-Saharan Africa: Upper-middle income",
                           "Western Africa",
                           "Western Asia",
                           "Western Europe",
                           "World",
                           "World excluding BRICS",
                           "World excluding India and China",
                           "World: High income",
                           "World: Low income",
                           "World: Lower-middle income",
                           "World: Lower-middle income excluding India",
                           "World: Upper-middle income",
                           "World: Upper-middle income excluding China")
# Average welfare coverage per country since 2020: keep country-level rows for
# both sexes combined and the "covered by at least one social protection
# benefit" contingency, average `obs_value` per country, then rewrite country
# names via the lookup below.
# NOTE(review): mean() is called without na.rm, so a country with any NA
# reading in the window gets NA.
WC_temp <- local({
  # Source spelling (name) -> replacement spelling (value).
  country_lookup <- c(
    "Côte d'Ivoire" = "Cote d'Ivoire",
    "Bahamas" = "Bahamas, The",
    "Congo, Democratic Republic of the" = "Congo, Dem. Rep.",
    "Egypt" = "Egypt, Arab Rep.",
    "Gambia" = "Gambia, The",
    "Kyrgyzstan" = "Kyrgyz Republic",
    "Bolivia (Plurinational State of)" = "Bolivia",
    "Curaçao" = "Curacao",
    "Venezuela (Bolivarian Republic of)" = "Venezuela, RB",
    "United States of America" = "United States",
    "United Kingdom of Great Britain and Northern Ireland" = "United Kingdom",
    "Türkiye" = "Turkiye",
    "Occupied Palestinian Territory" = "West Bank and Gaza",
    "Macao, China" = "Macao SAR, China",
    "Tanzania, United Republic of" = "Tanzania",
    "Micronesia (Federated States of)" = "Micronesia, Fed. Sts.",
    "Republic of Moldova" = "Moldova",
    "Republic of Korea" = "Korea, Rep.",
    "Lao People's Democratic Republic" = "Lao PDR",
    "Iran (Islamic Republic of)" = "Iran, Islamic Rep.",
    "Hong Kong, China" = "Hong Kong SAR, China",
    "Saint Kitts and Nevis" = "St. Kitts and Nevis",
    "Saint Vincent and the Grenadines" = "St. Vincent and the Grenadines",
    "Saint Lucia" = "St. Lucia",
    "Slovakia" = "Slovak Republic",
    "Yemen" = "Yemen, Rep.",
    "United States Virgin Islands" = "Virgin Islands (U.S.)"
  )

  data_WelfareCoverage %>%
    filter(
      !(ref_area.label %in% nonCountryDatepoints2) &
        sex.label == "Total" &
        classif1.label == "Contingency: Population covered by at least one social protection benefit" &
        time >= 2020
    ) %>%
    # Group under the final column name so no separate rename is needed.
    group_by(country = ref_area.label) %>%
    summarise(Welfare.Coverage = mean(obs_value)) %>%
    mutate(country = recode(country, !!!country_lookup))
})
# Replace the separate Jersey and Guernsey rows with one "Channel Islands" row
# whose welfare coverage is the population-weighted average of the two islands.
data_WelfareCoverage_tidy <- local({
  # Population weights; both figures are from World Population Review.
  # Guernsey's weight must be 64477 (not 6447) so that the two weights add up
  # to the 168466 denominator and match the age-structure calculation.
  pop_jersey <- 103989
  pop_guernsey <- 64477

  # Welfare coverage recorded for a single island (zero-length if absent).
  coverage_of <- function(island) {
    WC_temp$Welfare.Coverage[WC_temp$country == island]
  }

  WC_temp %>%
    add_row(
      country = "Channel Islands",
      Welfare.Coverage = (coverage_of("Jersey") * pop_jersey +
                            coverage_of("Guernsey") * pop_guernsey) /
        (pop_jersey + pop_guernsey)
    ) %>%
    # The individual islands are now represented by the combined row.
    filter(!country %in% c("Jersey", "Guernsey"))
})
# Average gender pay gap per country since 2020, restricted to the "_T"
# occupation rows (presumably the all-occupations total -- confirm against the
# dataset's code list), with country names rewritten via the lookup below.
# NOTE(review): mean() is called without na.rm, so a country with any NA
# reading in the window gets NA.
data_GenderPayGap_tidy <- local({
  # Source spelling (name) -> replacement spelling (value).
  country_lookup <- c(
    "Côte d'Ivoire" = "Cote d'Ivoire",
    "Bahamas" = "Bahamas, The",
    "Congo, Democratic Republic of the" = "Congo, Dem. Rep.",
    "Egypt" = "Egypt, Arab Rep.",
    "Gambia" = "Gambia, The",
    "Kyrgyzstan" = "Kyrgyz Republic",
    "Bolivia (Plurinational State of)" = "Bolivia",
    "Curaçao" = "Curacao",
    "Venezuela (Bolivarian Republic of)" = "Venezuela, RB",
    "United States of America" = "United States",
    "United Kingdom of Great Britain and Northern Ireland" = "United Kingdom",
    "Türkiye" = "Turkiye",
    "Occupied Palestinian Territory" = "West Bank and Gaza",
    "Macao, China" = "Macao SAR, China",
    "Tanzania, United Republic of" = "Tanzania",
    "Micronesia (Federated States of)" = "Micronesia, Fed. Sts.",
    "Republic of Moldova" = "Moldova",
    "Republic of Korea" = "Korea, Rep.",
    "Lao People's Democratic Republic" = "Lao PDR",
    "Iran (Islamic Republic of)" = "Iran, Islamic Rep.",
    "Hong Kong, China" = "Hong Kong SAR, China",
    "Saint Kitts and Nevis" = "St. Kitts and Nevis",
    "Saint Vincent and the Grenadines" = "St. Vincent and the Grenadines",
    "Saint Lucia" = "St. Lucia",
    "Slovakia" = "Slovak Republic",
    "Yemen" = "Yemen, Rep.",
    "United States Virgin Islands" = "Virgin Islands (U.S.)"
  )

  data_GenderPayGap %>%
    filter(occupation == "_T" & time_period >= 2020) %>%
    # Group under the final column name so no separate rename is needed.
    group_by(country = ref_area_desc) %>%
    summarise(Pay.Gap = mean(obs_value)) %>%
    mutate(country = recode(country, !!!country_lookup))
})
# Keep the flag code, country and 2024 global equality score, and rewrite
# country names via the lookup below. The score column keeps its original
# (long) name.
data_GenderEquality_tidy <- local({
  # Source spelling (name) -> replacement spelling (value).
  country_lookup <- c(
    "DR Congo" = "Congo, Dem. Rep.",
    "Republic of the Congo" = "Congo, Rep.",
    "Egypt" = "Egypt, Arab Rep.",
    "Gambia" = "Gambia, The",
    "Syria" = "Syrian Arab Republic",
    "Palestine" = "West Bank and Gaza",
    "Ivory Coast" = "Cote d'Ivoire",
    "Kyrgyzstan" = "Kyrgyz Republic",
    "Cape Verde" = "Cabo Verde",
    "Laos" = "Lao PDR",
    "Iran" = "Iran, Islamic Rep.",
    "Bahamas" = "Bahamas, The",
    "Brunei" = "Brunei Darussalam",
    "South Korea" = "Korea, Rep.",
    "Vietnam" = "Viet Nam",
    "Saint Kitts and Nevis" = "St. Kitts and Nevis",
    "Sint Maarten" = "Sint Maarten (Dutch part)",
    "Turkey" = "Turkiye",
    "Czech Republic" = "Czechia",
    "Macau" = "Macao SAR, China",
    "Russia" = "Russian Federation",
    "Hong Kong" = "Hong Kong SAR, China",
    "Slovakia" = "Slovak Republic",
    "United States Virgin Islands" = "Virgin Islands (U.S.)",
    "Saint Vincent and the Grenadines" = "St. Vincent and the Grenadines",
    "Saint Lucia" = "St. Lucia",
    "Micronesia" = "Micronesia, Fed. Sts.",
    "North Korea" = "Korea, Dem. People's Rep.",
    "Venezuela" = "Venezuela, RB",
    "Yemen" = "Yemen, Rep.",
    "Saint Martin" = "St. Martin (French part)"
  )

  data_GenderEquality %>%
    select(flagCode, country, GenderEquality_GlobalEqualityScore_score_2024) %>%
    mutate(country = recode(country, !!!country_lookup))
})
# Maternity leave per country expressed in days: take the leave length in
# weeks, rewrite country names via the lookup below, multiply the weeks by 7
# and keep only `country` and `Maternity.Leave.Days`.
data_MaternityLeave_tidy <- local({
  # Source spelling (name) -> replacement spelling (value).
  country_lookup <- c(
    "DR Congo" = "Congo, Dem. Rep.",
    "Republic of the Congo" = "Congo, Rep.",
    "Egypt" = "Egypt, Arab Rep.",
    "Gambia" = "Gambia, The",
    "Syria" = "Syrian Arab Republic",
    "Palestine" = "West Bank and Gaza",
    "Ivory Coast" = "Cote d'Ivoire",
    "Kyrgyzstan" = "Kyrgyz Republic",
    "Cape Verde" = "Cabo Verde",
    "Laos" = "Lao PDR",
    "Iran" = "Iran, Islamic Rep.",
    "Bahamas" = "Bahamas, The",
    "Brunei" = "Brunei Darussalam",
    "South Korea" = "Korea, Rep.",
    "Vietnam" = "Viet Nam",
    "Saint Kitts and Nevis" = "St. Kitts and Nevis",
    "Sint Maarten" = "Sint Maarten (Dutch part)",
    "Turkey" = "Turkiye",
    "Czech Republic" = "Czechia",
    "Macau" = "Macao SAR, China",
    "Russia" = "Russian Federation",
    "Hong Kong" = "Hong Kong SAR, China",
    "Slovakia" = "Slovak Republic",
    "United States Virgin Islands" = "Virgin Islands (U.S.)",
    "Saint Vincent and the Grenadines" = "St. Vincent and the Grenadines",
    "Saint Lucia" = "St. Lucia",
    "Micronesia" = "Micronesia, Fed. Sts.",
    "North Korea" = "Korea, Dem. People's Rep.",
    "Venezuela" = "Venezuela, RB",
    "Yemen" = "Yemen, Rep.",
    "Saint Martin" = "St. Martin (French part)"
  )

  data_MaternityLeave %>%
    # Select and rename the weeks column in one step.
    select(country, Maternity.Leave = MaternityLeave_LengthInWeeks_numOfWeeks_YearFree) %>%
    mutate(
      country = recode(country, !!!country_lookup),
      Maternity.Leave.Days = Maternity.Leave * 7
    ) %>%
    select(-Maternity.Leave)
})
# Paternity leave per country in days: keep the country and the days column
# (renamed to `Paternity.Leave.Days`) and rewrite country names via the lookup
# below.
data_PaternityLeave_tidy <- local({
  # Source spelling (name) -> replacement spelling (value).
  country_lookup <- c(
    "DR Congo" = "Congo, Dem. Rep.",
    "Republic of the Congo" = "Congo, Rep.",
    "Egypt" = "Egypt, Arab Rep.",
    "Gambia" = "Gambia, The",
    "Syria" = "Syrian Arab Republic",
    "Palestine" = "West Bank and Gaza",
    "Ivory Coast" = "Cote d'Ivoire",
    "Kyrgyzstan" = "Kyrgyz Republic",
    "Cape Verde" = "Cabo Verde",
    "Laos" = "Lao PDR",
    "Iran" = "Iran, Islamic Rep.",
    "Bahamas" = "Bahamas, The",
    "Brunei" = "Brunei Darussalam",
    "South Korea" = "Korea, Rep.",
    "Vietnam" = "Viet Nam",
    "Saint Kitts and Nevis" = "St. Kitts and Nevis",
    "Sint Maarten" = "Sint Maarten (Dutch part)",
    "Turkey" = "Turkiye",
    "Czech Republic" = "Czechia",
    "Macau" = "Macao SAR, China",
    "Russia" = "Russian Federation",
    "Hong Kong" = "Hong Kong SAR, China",
    "Slovakia" = "Slovak Republic",
    "United States Virgin Islands" = "Virgin Islands (U.S.)",
    "Saint Vincent and the Grenadines" = "St. Vincent and the Grenadines",
    "Saint Lucia" = "St. Lucia",
    "Micronesia" = "Micronesia, Fed. Sts.",
    "North Korea" = "Korea, Dem. People's Rep.",
    "Venezuela" = "Venezuela, RB",
    "Yemen" = "Yemen, Rep.",
    "Saint Martin" = "St. Martin (French part)"
  )

  data_PaternityLeave %>%
    # Select and rename the days column in one step.
    select(country, Paternity.Leave.Days = PaternityLeave_DaysOfPaternityLeave_num_YearFree) %>%
    mutate(country = recode(country, !!!country_lookup))
})
# Mean family expenditure per reporting area since 2020 (the "OECD" aggregate
# row is excluded), with the country name attached by matching REF_AREA to
# Country.Code in the population table. Rows with any NA -- including areas
# without a matching country code -- are dropped.
# NOTE(review): "Expeniture" is misspelled, but the column name is kept because
# the social-expenditure step below refers to it.
data_ExpenditureonFamily_tidy <- local({
  country_names <- select(data_Population_tidy, Country.Name, Country.Code)

  data_ExpenditureonFamily %>%
    filter(REF_AREA != "OECD" & TIME_PERIOD >= 2020) %>%
    group_by(REF_AREA) %>%
    summarise(Expeniture.on.Family = mean(OBS_VALUE)) %>%
    left_join(country_names, by = c("REF_AREA" = "Country.Code")) %>%
    drop_na()
})

# Mean incapacity expenditure per reporting area since 2020 (the "OECD"
# aggregate row is excluded), with the country name attached by matching
# REF_AREA to Country.Code in the population table. Rows with any NA --
# including areas without a matching country code -- are dropped.
# NOTE(review): "Expeniture" is misspelled, but the column name is kept because
# the social-expenditure step below refers to it.
data_ExpenditureonIncapacity_tidy <- local({
  country_names <- select(data_Population_tidy, Country.Name, Country.Code)

  data_ExpenditureonIncapacity %>%
    filter(REF_AREA != "OECD" & TIME_PERIOD >= 2020) %>%
    group_by(REF_AREA) %>%
    summarise(Expeniture.on.Incapacity = mean(OBS_VALUE)) %>%
    left_join(country_names, by = c("REF_AREA" = "Country.Code")) %>%
    drop_na()
})

# Total social expenditure per country: family expenditure plus incapacity
# expenditure, with a missing component counted as 0. The family table drives
# the join, so areas present only in the incapacity table are not included.
data_SocialExpenditure_tidy <- local({
  incapacity <- select(data_ExpenditureonIncapacity_tidy, -Country.Name)

  data_ExpenditureonFamily_tidy %>%
    left_join(incapacity, by = "REF_AREA") %>%
    mutate(
      Social.Expenditure =
        replace_na(Expeniture.on.Incapacity, 0) + replace_na(Expeniture.on.Family, 0)
    ) %>%
    # Only the combined figure is kept; the two components are dropped.
    select(-c(Expeniture.on.Family, Expeniture.on.Incapacity)) %>%
    rename(Country.Code = REF_AREA)
})

Only 42 countries are represented in these datasets (the second has only 38 after removing NAs and countries without any readings since 2020). We therefore drop these tables entirely, because we already have welfare coverage. Moreover, expenditure on family may actually reflect how important family is to a society; if that society holds traditional gender roles, this would distort the measure.

Join data

We want to choose the countries with which we can predict the effect of different policies on Singapore. Hence, we try to identify factors that influence how effective possible policies to boost female LFPR would be.

Policies would broadly be either financial (increasing the returns for females to work, decreasing the opportunity cost of hiring females) or non-financial (decreasing the opportunity cost for females to work, increasing the opportunity cost of not hiring females). Below, I will list out the data we are using and how it is relevant to these factors.

Summary of data we are going to use

Demographic:

  • Female.LFPR: which signals a starting point. This can suggest how accepted female employment is at large (if wanna go quirky, we can do quick LMs and see which are the most correlated?).
  • Population: Countries with large and small populations engage in different kinds of policies and face different bureaucratic burdens.
  • Fertility.Rate: Singapore is struggling with its ageing population. Policies which target female LFPR must work in the context of the ageing workforce.
  • Tertiary.Education: suggests the kinds of jobs that are available to potential female workers, which may then affect the policies that are suited for boosting employment in those sectors.

Economic:

  • GDP.PC: Generally, female LFPR does depend on income level of a country. In very low income countries, female LFPR is high due to females working being necessary. Hence, female LFPR is unlikely to be increased through policy. In middle income countries, stigma around married women working is particularly high, as it signals the inaptitude of the man to make an adequate living.
  • Unemployment: economic capacity for more women to enter the workforce.

Cultural:

  • Welfare.Coverage: Whether people are used to welfare policies and a generally interventionist government.
  • Maternity.Leave.Total.W: suggests the degree of family care expected of female workers, but also
  • Gender.Equality: generally, higher gender equality means greater respect for women’s rights and abilities (not always, but usually). Hence, it should come with an increased acceptance of female employment.

Code

# Region and income group per country code, from the population metadata file.
income_group <- select(
  read.csv("../data/metadata/Population/Metadata_Country_API_SP.POP.TOTL_DS2_en_csv_v2_2590.csv"),
  Country.Code, Region, IncomeGroup
)

# One row per country with every indicator joined on country name.
# Country.Code is kept only from the female LFPR table, so it is dropped from
# the other World Bank tables before joining.
full_data <- data_FemaleLFPR_tidy %>%
  full_join(select(data_Population_tidy, -Country.Code), by = "Country.Name") %>%
  full_join(select(data_FertilityRate_tidy, -Country.Code), by = "Country.Name") %>%
  full_join(select(data_TertiaryEducation_tiday, -Country.Code), by = "Country.Name") %>%
  full_join(select(data_GDPPerCapita_tidy, -Country.Code), by = "Country.Name") %>%
  left_join(income_group, by = "Country.Code") %>%
  # Encode the income group as its rank, 1 (low income) to 4 (high income).
  mutate(
    IncomeGroup = as.numeric(
      factor(IncomeGroup,
             levels = c("Low income", "Lower middle income",
                        "Upper middle income", "High income"),
             labels = c(1, 2, 3, 4),
             ordered = TRUE)
    )
  ) %>%
  # full_join(data_Sectors_tidy %>% select(-Country.Code), by = c("Country.Name")) %>%
  # select(-Agri, -Manu, -Indus, -Sector.Similarity) %>%
  full_join(select(data_Unemployment_tidy, -Country.Code), by = "Country.Name") %>%
  full_join(select(data_ServicesEmployment_tidy, -Country.Code), by = "Country.Name") %>%
  full_join(data_WelfareCoverage_tidy, by = c("Country.Name" = "country")) %>%
  full_join(data_MaternityLeave_tidy, by = c("Country.Name" = "country")) %>%
  full_join(data_PaternityLeave_tidy, by = c("Country.Name" = "country")) %>%
  # A missing leave entitlement is treated as zero days.
  replace_na(list(Maternity.Leave.Days = 0, Paternity.Leave.Days = 0)) %>%
  mutate(
    Maternity.Paternity.Leave.Diff = Maternity.Leave.Days - Paternity.Leave.Days
  ) %>%
  select(-Maternity.Leave.Days, -Paternity.Leave.Days) %>%
  full_join(select(data_GenderEquality_tidy, -flagCode),
            by = c("Country.Name" = "country")) %>%
  rename(Gender.Equality = GenderEquality_GlobalEqualityScore_score_2024) %>%
  relocate(Country.Code, Region, .after = Country.Name) %>%
  # Standardise every indicator. scale() returns a one-column matrix, which is
  # why the printed tibble shows the columns as `<dbl[,1]>`.
  mutate(across(Female.LFPR:Gender.Equality, scale))

Correlation plot

# Preview the first rows of the joined, standardised table.
head(full_data)
## # A tibble: 6 × 14
##   Country.Name Country.Code Region                Female.LFPR[,1] Population[,1]
##   <chr>        <chr>        <chr>                           <dbl>          <dbl>
## 1 Afghanistan  AFG          South Asia                     -2.95         0.0310 
## 2 Albania      ALB          Europe & Central Asia           0.182       -0.241  
## 3 Algeria      DZA          Middle East & North …          -2.38         0.0642 
## 4 Angola       AGO          Sub-Saharan Africa              1.45        -0.00207
## 5 Argentina    ARG          Latin America & Cari…           0.145        0.0598 
## 6 Armenia      ARM          Europe & Central Asia           0.370       -0.240  
## # ℹ 9 more variables: Fertility.Rate <dbl[,1]>, Tertiary.Education <dbl[,1]>,
## #   GDP.PC <dbl[,1]>, IncomeGroup <dbl[,1]>, Unemployment <dbl[,1]>,
## #   Services.Employment <dbl[,1]>, Welfare.Coverage <dbl[,1]>,
## #   Maternity.Paternity.Leave.Diff <dbl[,1]>, Gender.Equality <dbl[,1]>
# Pairwise Pearson correlations between the standardised indicators, rounded
# to two decimals. Only countries with no missing indicator are used, and
# IncomeGroup is left out.
cormat <- round(
  cor(full_data %>%
        select(Female.LFPR:Gender.Equality, -IncomeGroup) %>%
        drop_na()),
  2
)
melted_cormat_full <- melt(cormat)

# Return `cormat` with its upper triangle set to NA (the diagonal is kept).
get_lower_tri <- function(cormat) {
  cormat[upper.tri(cormat)] <- NA
  cormat
}

# Return `cormat` with its lower triangle set to NA (the diagonal is kept).
get_upper_tri <- function(cormat) {
  cormat[lower.tri(cormat)] <- NA
  cormat
}

# Keep one copy of each pair so the heatmap is triangular.
upper_tri <- get_upper_tri(cormat)
melted_cormat <- melt(upper_tri, na.rm = TRUE)

ggplot(data = melted_cormat, aes(Var2, Var1, fill = value)) +
  geom_tile(color = "white") +
  # `limits` is spelled out in full rather than relying on partial matching.
  scale_fill_gradient2(low = "dodgerblue", high = "tomato", mid = "white",
                       midpoint = 0, limits = c(-1, 1), space = "Lab",
                       name = "Pearson\nCorrelation") +
  geom_text(aes(Var2, Var1, label = value), color = "black", size = 2.5) +
  labs(title = "Correlation between variables") +
  theme_minimal() +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1, size = 10, colour = "black"),
        axis.text.y = element_text(size = 10, colour = "black"),
        panel.grid.major = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        # A numeric `legend.position` is deprecated since ggplot2 3.5.0; the
        # coordinates now go in `legend.position.inside`.
        legend.position = "inside",
        legend.position.inside = c(.55, .95),
        legend.justification = c("right", "top"),
        legend.direction = "horizontal") +
  guides(fill = guide_colorbar(barwidth = 6, barheight = 1,
                               title.position = "top", title.hjust = 0.5)) +
  coord_fixed()
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# NOTE(review): this sets the base-graphics panel layout. ggplot2 draws with
# grid, so the facet layout below comes from facet_wrap(), not from par().
par(mfrow = c(3, 3))

# Long format: one row per country and explanatory measure, with Female.LFPR
# repeated alongside so each measure can be plotted against it.
pivoted_table <- pivot_longer(
  select(full_data, -IncomeGroup),
  cols = Population:Gender.Equality,
  names_to = "Measure",
  values_to = "x"
)

# One scatter panel per measure, points coloured by region and labelled with
# the country code.
pivoted_table %>%
  ggplot(aes(x = x, y = Female.LFPR)) +
  geom_point(aes(colour = Region), size = 0.5, alpha = 0.7) +
  geom_text_repel(aes(label = Country.Code), colour = "black", size = 2) +
  facet_wrap(~Measure) +
  scale_color_brewer(palette = "Set1") +
  theme_minimal() +
  theme(plot.background = element_rect(colour = "black", linewidth = 1))
## Warning: Removed 464 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 464 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 184 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 183 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 135 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 176 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 185 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 184 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 145 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 186 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 164 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Trying to calculate the distance

Using all the data

# Rows used for the distance calculation: every indicator except IncomeGroup
# must be present. The complete-case set is derived once so the covariance
# matrix, the data matrix, Singapore's row and the country labels all refer
# to exactly the same rows.
mdist_rows <- full_data %>%
  drop_na(-c(Country.Name, Country.Code, Region, IncomeGroup))
mdist_features <- mdist_rows %>%
  select(-Country.Name, -Country.Code, -Region, -IncomeGroup)
is_singapore <- mdist_rows$Country.Name %in% "Singapore"
stopifnot(
  "Singapore must appear exactly once among the complete cases" =
    sum(is_singapore) == 1
)

cov_mat <- var(mdist_features)
data_mat <- data.matrix(mdist_features)
# Singapore's indicator values, kept as a 1 x p matrix.
centre <- data_mat[is_singapore, , drop = FALSE]

# mahalanobis() returns the *squared* Mahalanobis distance of each row from
# `centre`; the ranking of countries is the same as for the unsquared value.
m_distances <- mahalanobis(data_mat, centre, cov_mat, FALSE)

# Euclidean distance of every country from Singapore. dist() returns the packed
# lower triangle of the pairwise matrix, which cannot be indexed with a
# per-row mask, so the distances are computed directly from Singapore's row.
eucl_distances <- sqrt(rowSums(sweep(data_mat, 2, as.numeric(centre))^2))
m_distances
##   [1] 27.620240 29.701351 31.542062 37.444222 26.269301 15.896329 10.146802
##   [8] 22.640264 13.004247 25.437918 14.364143 29.433331 33.459308 29.334119
##  [15] 22.272403 41.015953 28.095130  9.367683 43.913156 26.930948 33.898638
##  [22] 29.567077 25.214967 13.964853 33.283812 26.142470 72.883900 25.683435
##  [29] 32.799695 24.285367 19.920807 20.386821 16.227415 10.969439 25.168609
##  [36] 28.721271 27.102218 31.640234 18.147954 25.526976 22.814734 16.121994
##  [43] 26.835732 14.845850 24.365300 47.650874 27.865065 22.689982 19.491508
##  [50] 32.867749 74.131996 23.280159 36.322091  5.116855 23.550754 13.286236
##  [57] 21.608483 40.968659 25.320646 18.286335 22.908154 27.929269 27.730708
##  [64] 21.325499 17.345018 16.886056 29.392207 20.306932 27.349885 14.966199
##  [71] 25.317955 28.401142 31.932869 27.751103 24.953389 27.368575 34.779321
##  [78] 32.530192 11.295741 22.231831 39.331100 22.934466 27.803792 18.221852
##  [85] 25.692535 16.909264 35.692245 17.939227 20.959382 16.924832 16.445287
##  [92] 27.828524 16.281439 32.305872 24.969094  0.000000 25.294531 16.395095
##  [99] 56.113647 20.392075 25.055230 30.663561 14.993569  7.945078 25.689453
## [106] 28.596239 26.863530 29.998329 30.939925 26.468576 26.388715 19.448250
## [113] 24.663554 11.084884 23.941565 29.952899 27.098614 25.102389
# Attach the distances to the rows they were computed from. m_distances was
# built from countries with no missing indicator, ignoring IncomeGroup, so the
# same rule is used here; IncomeGroup stays as a column but a missing value in
# it no longer risks a row-count mismatch in cbind().
full_data_with_d <- cbind(
  full_data %>% drop_na(-c(Country.Name, Country.Code, Region, IncomeGroup)),
  m_distances
)

# All countries, ordered by distance from Singapore. The x values are the
# standardised (scale()) distances; the dashed line marks the Netherlands.
ggplot(full_data_with_d %>% arrange(m_distances),
       aes(y = reorder(Country.Name, -m_distances, sum))) +
  geom_hline(yintercept = "Netherlands", lty = 2, alpha = 0.8) +
  geom_point(aes(x = scale(m_distances)), color = "deepskyblue") +
  labs(title = "Mahalanobis Similarity to Singapore",
       x = "Mahalanobis distance", y = "Country") +
  theme_minimal()

# The 20 countries closest to Singapore, on the unscaled distance values.
ggplot(full_data_with_d %>% slice_min(m_distances, n = 20),
       aes(x = m_distances)) +
  geom_hline(yintercept = "Netherlands", lty = 2, alpha = 0.8) +
  geom_point(aes(y = reorder(Country.Name, -m_distances, sum)),
             color = "dodgerblue", size = 2) +
  labs(title = "Similarity to Singapore (Top 20)",
       subtitle = "Mahalanobis distance from Singapore",
       x = "Mahalanobis distance", y = "Country") +
  theme_minimal()

# Add each country's Mahalanobis distance from Singapore to a data frame.
#
# df: a data frame with the identifier columns Country.Name, Country.Code and
#   Region; every other column is treated as a feature. Rows with any missing
#   value are dropped first.
# Returns: the complete rows of `df` with an extra column `m_dist`, the
#   *squared* Mahalanobis distance (as returned by stats::mahalanobis()) of
#   each row from the Singapore row, using the covariance of the complete rows.
# Errors if an identifier column is missing or if Singapore does not appear
#   exactly once among the complete rows (previously this produced NA
#   distances without any message).
create_mdist_df <- function(df) {
  id_cols <- c("Country.Name", "Country.Code", "Region")
  missing_ids <- setdiff(id_cols, names(df))
  if (length(missing_ids) > 0) {
    stop("`df` is missing column(s): ", paste(missing_ids, collapse = ", "),
         call. = FALSE)
  }

  # Drop incomplete rows once and reuse the result everywhere below.
  complete <- df[stats::complete.cases(df), , drop = FALSE]
  rownames(complete) <- NULL

  feature_cols <- setdiff(names(complete), id_cols)
  features <- data.matrix(complete[, feature_cols, drop = FALSE])

  is_singapore <- complete$Country.Name %in% "Singapore"
  if (sum(is_singapore) != 1) {
    stop("Expected exactly one complete row for Singapore, found ",
         sum(is_singapore), call. = FALSE)
  }

  m_dist <- stats::mahalanobis(
    features,
    features[is_singapore, ],
    stats::var(features),
    FALSE
  )
  cbind(complete, m_dist = m_dist)
}

# Dot plot of every country in `df_with_mdist`, ordered by `m_dist`.
#
# df_with_mdist: data frame with Country.Name and m_dist columns.
# cutoff: country name at which the dashed horizontal line is drawn.
# The x values are m_dist standardised with scale().
plot_mdist_data <- function(df_with_mdist, cutoff) {
  country_order <- aes(y = reorder(Country.Name, -m_dist, sum))
  cutoff_line <- geom_hline(yintercept = cutoff, lty = 2, alpha = 0.8)
  distance_points <- geom_point(aes(x = scale(m_dist)),
                                color = "dodgerblue", size = 2)

  ggplot(df_with_mdist, country_order) +
    cutoff_line +
    distance_points +
    theme_minimal() +
    labs(title = "Mahalanobis Similarity to Singapore",
         x = "Mahalanobis distance", y = "Country")
}

# Dot plot of the 20 rows of `df_with_mdist` with the smallest `m_dist`.
#
# df_with_mdist: data frame with Country.Name and m_dist columns.
# cutoff: country name at which the dashed horizontal line is drawn.
# scale() is applied to the plotted rows only, so the x values are
# standardised within these 20 countries.
plot_mdist_data_top_20 <- function(df_with_mdist, cutoff) {
  closest <- slice_min(df_with_mdist, m_dist, n = 20)
  cutoff_line <- geom_hline(yintercept = cutoff, lty = 2, alpha = 0.8)
  distance_points <- geom_point(aes(x = scale(m_dist)),
                                color = "dodgerblue", size = 2)

  ggplot(closest, aes(y = reorder(Country.Name, -m_dist, sum))) +
    cutoff_line +
    distance_points +
    theme_minimal() +
    labs(title = "Similarity to Singapore (Top 20)",
         subtitle = "Mahalanobis distance from Singapore",
         x = "Mahalanobis distance", y = "Country")
}

# Dot plot of the 20 rows of `df_with_mdist` with the largest `m_dist`.
#
# df_with_mdist: data frame with Country.Name and m_dist columns.
# cutoff: country name at which the dashed horizontal line is drawn.
# scale() is applied to the plotted rows only, so the x values are
# standardised within these 20 countries.
plot_mdist_data_last_20 <- function(df_with_mdist, cutoff) {
  furthest <- slice_max(df_with_mdist, m_dist, n = 20)
  cutoff_line <- geom_hline(yintercept = cutoff, lty = 2, alpha = 0.8)
  distance_points <- geom_point(aes(x = scale(m_dist)),
                                color = "dodgerblue", size = 2)

  ggplot(furthest, aes(y = reorder(Country.Name, -m_dist, sum))) +
    cutoff_line +
    distance_points +
    theme_minimal() +
    labs(title = "Similarity to Singapore (Bottom 20)",
         subtitle = "Mahalanobis distance from Singapore",
         x = "Mahalanobis distance", y = "Country")
}

Selective variables (all countries)

# All countries, restricted to the selected indicators; countries missing any
# of them are dropped before the distances are computed.
df_2 <- full_data %>%
  select(all_of(c(
    "Country.Name", "Country.Code", "Region",
    "Female.LFPR",
    "Fertility.Rate", "Tertiary.Education",
    "GDP.PC", "Unemployment", "Services.Employment",
    "Gender.Equality"
  ))) %>%
  drop_na()
df_2_mdist <- create_mdist_df(df_2)

# Full ranking, closest 20 and furthest 20, each with the dashed line drawn at
# the United Arab Emirates.
plot_mdist_data(df_2_mdist, cutoff = "United Arab Emirates")

plot_mdist_data_top_20(df_2_mdist, cutoff = "United Arab Emirates")

plot_mdist_data_last_20(df_2_mdist, cutoff = "United Arab Emirates")

# Female LFPR against GDP per capita (both standardised), coloured by the
# gender equality score converted back to its original scale.
# scale() ignores missing values when it centres and scales, so the same is
# done here with na.rm = TRUE; otherwise a single missing score would turn the
# whole colour variable into NA and drop_na() would remove every row.
ggplot(full_data %>%
         mutate(
           Gender.Equality = Gender.Equality *
             sd(data_GenderEquality_tidy %>%
                  pull(GenderEquality_GlobalEqualityScore_score_2024),
                na.rm = TRUE) +
             mean(data_GenderEquality_tidy %>%
                    pull(GenderEquality_GlobalEqualityScore_score_2024),
                  na.rm = TRUE)
         ) %>%
         drop_na(),
       aes(x = GDP.PC, y = Female.LFPR,
           colour = Gender.Equality)) +
  geom_point(size = 2, alpha = 0.8) +
  scale_color_gradient2(low = "black", mid = "deepskyblue", high = "skyblue",
                        midpoint = 0.75,
                        name = "Gender Equality Index", space = "Lab") +
  geom_smooth(colour = "tomato2", se = FALSE, method = "loess", formula = 'y ~ x') +
  geom_text_repel(aes(label = Country.Code), colour = "black", size = 2,
                  max.overlaps = 20) +
  labs(title = "Relationship Between Female Labour Force Participation Rate and GDP per Capita",
       subtitle = "and state of gender equality",
       x = "GDP per Capita", y = "FLFP Rate") +
  theme_minimal() +
  # Axis numbers are hidden: the plotted values are standardised scores.
  theme(axis.text = element_blank(),
        # A numeric `legend.position` is deprecated since ggplot2 3.5.0; the
        # coordinates now go in `legend.position.inside`.
        legend.position = "inside",
        legend.position.inside = c(.9, .3),
        legend.justification = c("right", "top"),
        legend.direction = "horizontal") +
  guides(colour = guide_colorbar(barwidth = 10, barheight = 1,
                                 title.position = "top", title.hjust = 0.5))

Only Asia

# East Asia & Pacific countries only, using every indicator except
# IncomeGroup; countries with any missing value are dropped.
df_3 <- full_data %>%
  select(-IncomeGroup) %>%
  filter(Region == "East Asia & Pacific") %>%
  drop_na()
df_3_mdist <- create_mdist_df(df_3)

# Full ranking, closest 20 and furthest 20, each with the dashed line at Japan.
plot_mdist_data(df_3_mdist, cutoff = "Japan")

plot_mdist_data_top_20(df_3_mdist, cutoff = "Japan")

plot_mdist_data_last_20(df_3_mdist, cutoff = "Japan")

Only Asia with selective variables

# East Asia & Pacific countries only, restricted to the selected indicators;
# countries missing any of them are dropped.
df_4 <- full_data %>%
  filter(Region == "East Asia & Pacific") %>%
  select(all_of(c(
    "Country.Name", "Country.Code", "Region",
    "Female.LFPR",
    "Fertility.Rate", "Tertiary.Education",
    "GDP.PC", "Unemployment", "Services.Employment",
    "Gender.Equality"
  ))) %>%
  drop_na()
df_4_mdist <- create_mdist_df(df_4)

# Full ranking and closest 20, each with the dashed line at Malaysia.
plot_mdist_data(df_4_mdist, cutoff = "Malaysia")

plot_mdist_data_top_20(df_4_mdist, cutoff = "Malaysia")

# Comparator countries picked from each of the four analyses above.
# chosen_1: global, all variables.
chosen_1 <- c("Ireland", "Switzerland", "Brunei Darussalam", "Austria",
              "Denmark", "United States", "Netherlands")
# chosen_2: global, selected variables.
chosen_2 <- c("Ireland", "Qatar", "Switzerland", "United States",
              "Brunei Darussalam", "United Arab Emirates")
# chosen_3: regional, all variables.
chosen_3 <- c("Indonesia", "Australia", "Viet Nam", "Lao PDR", "Thailand",
              "Japan")
# chosen_4: regional, selected variables. "Indonesia" was listed twice; the
# duplicate is removed so the vector can safely be used as factor levels.
chosen_4 <- c("Australia", "Korea, Rep.", "Thailand", "Indonesia", "Viet Nam",
              "Brunei Darussalam", "Lao PDR", "Japan", "Malaysia")
# Every country chosen by at least one analysis, in order of first appearance.
chosen_combined <- unique(c(chosen_1, chosen_2, chosen_3, chosen_4))
# Print Singapore's standardised row: the reference point for the
# per-variable differences computed in the next section.
full_data %>% filter(Country.Name == "Singapore")
## # A tibble: 1 × 14
##   Country.Name Country.Code Region              Female.LFPR[,1] Population[,1]
##   <chr>        <chr>        <chr>                         <dbl>          <dbl>
## 1 Singapore    SGP          East Asia & Pacific           0.754         -0.219
## # ℹ 9 more variables: Fertility.Rate <dbl[,1]>, Tertiary.Education <dbl[,1]>,
## #   GDP.PC <dbl[,1]>, IncomeGroup <dbl[,1]>, Unemployment <dbl[,1]>,
## #   Services.Employment <dbl[,1]>, Welfare.Coverage <dbl[,1]>,
## #   Maternity.Paternity.Leave.Diff <dbl[,1]>, Gender.Equality <dbl[,1]>

Driving factors of similarity

# Absolute difference between each country and Singapore on every
# standardised indicator, in long format (one row per country and variable),
# with countries ordered by their distance from Singapore.
# Singapore's values are read from the data instead of being hard-coded, so
# they stay correct if the inputs or the scaling change.
data_chosen_countries <- local({
  ranked <- full_data_with_d %>%
    arrange(m_distances) %>%
    select(-IncomeGroup, -Country.Code, -Region)

  is_singapore <- ranked$Country.Name %in% "Singapore"
  stopifnot(
    "Singapore must appear exactly once in full_data_with_d" =
      sum(is_singapore) == 1
  )

  ranked %>%
    # as.numeric() flattens the one-column matrices produced by scale().
    mutate(across(
      Female.LFPR:Gender.Equality,
      ~ abs(as.numeric(.x) - as.numeric(.x)[is_singapore])
    )) %>%
    pivot_longer(Female.LFPR:Gender.Equality,
                 names_to = "Variable", values_to = "Value")
})

## Global expanded

# Heatmap of per-variable differences from Singapore for the countries in
# chosen_1 together with the 20 countries furthest from Singapore, with
# countries on the x axis ordered by increasing distance.
data_chosen_countries %>%
  filter(Country.Name %in% c(
    chosen_1,
    full_data_with_d %>% slice_max(m_distances, n = 20) %>% pull(Country.Name)
  )) %>%
  ggplot(aes(
    x = factor(Country.Name,
               levels = pull(arrange(full_data_with_d, m_distances), Country.Name)),
    y = Variable
  )) +
  geom_tile(aes(fill = Value), color = "white") +
  geom_text(aes(label = round(Value, 2)), color = "black", size = 2.5) +
  scale_fill_gradient2(low = "tomato", mid = "white", high = "white",
                       midpoint = 4.75,
                       name = "Difference \n(scaled)") +
  labs(title = "Difference Between Countries and Singapore (Global)",
       subtitle = "on each variable from expanded list",
       x = "Country", y = "Variable") +
  coord_fixed() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Heatmap of per-variable differences from Singapore for the countries in
# chosen_1, shown in the order they are listed there.
data_chosen_countries %>%
  filter(Country.Name %in% chosen_1) %>%
  ggplot(aes(x = factor(Country.Name, levels = chosen_1), y = Variable)) +
  geom_tile(aes(fill = Value), color = "white") +
  geom_text(aes(label = round(Value, 2)), color = "black", size = 2.5) +
  scale_fill_gradient2(low = "tomato", mid = "white", high = "white",
                       midpoint = 4,
                       name = "Difference \n(scaled)") +
  labs(title = "Difference Between Selected \nCountries and Singapore (Global)",
       subtitle = "on each variable from expanded list",
       x = "Country", y = "Variable") +
  coord_fixed() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Heatmap of per-variable differences from Singapore for the 20 countries
# furthest from Singapore, ordered from the largest distance downwards.
data_chosen_countries %>%
  filter(Country.Name %in% pull(
    slice_max(full_data_with_d, m_distances, n = 20), Country.Name
  )) %>%
  ggplot(aes(
    x = factor(Country.Name,
               levels = pull(slice_max(full_data_with_d, m_distances, n = 20),
                             Country.Name)),
    y = Variable
  )) +
  geom_tile(aes(fill = Value), color = "white") +
  geom_text(aes(label = round(Value, 2)), color = "black", size = 2.5) +
  scale_fill_gradient2(low = "tomato", mid = "white", high = "white",
                       midpoint = 4.75,
                       name = "Difference \n(scaled)") +
  labs(title = "Difference Between Furthest Countries and Singapore (Global)",
       subtitle = "on each variable from expanded list",
       x = "Country", y = "Variable") +
  coord_fixed() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Global selected

# Heatmap of per-variable differences from Singapore for the countries in
# chosen_2. The three variables not in the selected list have their value
# replaced by Inf.
data_chosen_countries %>%
  filter(Country.Name %in% chosen_2) %>%
  mutate(Value = replace(
    Value,
    Variable %in% c("Population", "Welfare.Coverage",
                    "Maternity.Paternity.Leave.Diff"),
    Inf
  )) %>%
  ggplot(aes(x = factor(Country.Name, levels = chosen_2), y = Variable)) +
  geom_tile(aes(fill = Value), color = "white") +
  geom_text(aes(label = round(Value, 2)), color = "black", size = 2.5) +
  scale_fill_gradient2(low = "tomato", mid = "white", high = "white",
                       midpoint = 4,
                       name = "Difference \n(scaled)") +
  labs(title = "Difference to Singapore",
       x = "Country") +
  coord_fixed() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Heatmap of per-variable differences from Singapore for the 15 countries
# furthest from Singapore on the selected variables (df_2_mdist), ordered from
# the largest distance downwards. The three variables not in the selected list
# have their value replaced by Inf.
# The row filter and the axis ordering now both use n = 15; previously the
# ordering was taken from the furthest 20 while only 15 countries were kept.
ggplot(data_chosen_countries %>%
         filter(Country.Name %in% (df_2_mdist %>%
                                     slice_max(m_dist, n = 15) %>%
                                     pull(Country.Name))) %>%
         mutate(Value = replace(Value,
                                Variable %in% c("Population", "Welfare.Coverage",
                                                "Maternity.Paternity.Leave.Diff"),
                                Inf)),
       aes(x = factor(Country.Name,
                      (df_2_mdist %>%
                         slice_max(m_dist, n = 15) %>%
                         pull(Country.Name))),
           y = Variable)) +
  geom_tile(aes(fill = Value), color = "white") +
  scale_fill_gradient2(low = "tomato", mid = "white", high = "white",
                       midpoint = 4,
                       name = "Difference \n(scaled)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_text(aes(label = round(Value, 2)), color = "black", size = 2.5) +
  labs(title = "Difference to Singapore",
       x = "Country", y = "Variable") +
  coord_fixed()

## Regional expanded

# Heatmap of per-variable differences from Singapore for every country in the
# regional table (df_3_mdist), ordered by increasing regional distance.
data_chosen_countries %>%
  filter(Country.Name %in% pull(df_3_mdist, Country.Name)) %>%
  ggplot(aes(
    x = factor(Country.Name,
               levels = pull(arrange(df_3_mdist, m_dist), Country.Name)),
    y = Variable
  )) +
  geom_tile(aes(fill = Value), color = "white") +
  geom_text(aes(label = round(Value, 2)), color = "black", size = 2.5) +
  scale_fill_gradient2(low = "tomato", mid = "white", high = "white",
                       midpoint = 4.75,
                       name = "Difference \n(scaled)") +
  labs(title = "Difference Between Countries and Singapore (Regional)",
       subtitle = "on each variable from expanded list",
       x = "Country", y = "Variable") +
  coord_fixed() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Heatmap of per-variable differences from Singapore for the countries in
# chosen_3, shown in the order they are listed there.
data_chosen_countries %>%
  filter(Country.Name %in% chosen_3) %>%
  ggplot(aes(x = factor(Country.Name, levels = chosen_3), y = Variable)) +
  geom_tile(aes(fill = Value), color = "white") +
  geom_text(aes(label = round(Value, 2)), color = "black", size = 2.5) +
  scale_fill_gradient2(low = "tomato", mid = "white", high = "white",
                       midpoint = 4,
                       name = "Difference \n(scaled)") +
  labs(title = "Difference Between Selected\n Countries and Singapore (Regional)",
       subtitle = "on each variable from expanded list",
       x = "Country") +
  coord_fixed() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Heatmap of per-variable differences from Singapore for the 10 countries in
# the regional table (df_3_mdist) that are furthest from Singapore, ordered
# from the largest distance downwards.
data_chosen_countries %>%
  filter(Country.Name %in% pull(
    slice_max(df_3_mdist, m_dist, n = 10), Country.Name
  )) %>%
  ggplot(aes(
    x = factor(Country.Name,
               levels = pull(slice_max(df_3_mdist, m_dist, n = 10),
                             Country.Name)),
    y = Variable
  )) +
  geom_tile(aes(fill = Value), color = "white") +
  geom_text(aes(label = round(Value, 2)), color = "black", size = 2.5) +
  scale_fill_gradient2(low = "tomato", mid = "white", high = "white",
                       midpoint = 4,
                       name = "Difference \n(scaled)") +
  labs(title = "Difference Between Furthest Countries and Singapore (Regional)",
       subtitle = "on each variable from expanded list",
       x = "Country", y = "Variable") +
  coord_fixed() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Regional selected

# Heatmap of per-variable differences from Singapore for the 5 rows of the
# regional selected-variable table (df_4_mdist) with the smallest distance,
# ordered by increasing distance. The three variables not in the selected list
# have their value replaced by Inf.
data_chosen_countries %>%
  filter(Country.Name %in% pull(
    slice_min(df_4_mdist, m_dist, n = 5), Country.Name
  )) %>%
  mutate(Value = replace(
    Value,
    Variable %in% c("Population", "Welfare.Coverage",
                    "Maternity.Paternity.Leave.Diff"),
    Inf
  )) %>%
  ggplot(aes(
    x = factor(Country.Name,
               levels = pull(slice_min(df_4_mdist, m_dist, n = 5),
                             Country.Name)),
    y = Variable
  )) +
  geom_tile(aes(fill = Value), color = "white") +
  geom_text(aes(label = round(Value, 2)), color = "black", size = 2.5) +
  scale_fill_gradient2(low = "tomato", mid = "white", high = "white",
                       midpoint = 4,
                       name = "Difference \n(scaled)") +
  labs(title = "Difference Between Regional \nComparators and Singapore",
       subtitle = "on each variable",
       x = "Country") +
  coord_fixed() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Heatmap of per-variable differences from Singapore for the 10 rows of the
# regional selected-variable table (df_4_mdist) with the largest distance,
# ordered from the largest distance downwards. The three variables not in the
# selected list have their value replaced by Inf.
data_chosen_countries %>%
  filter(Country.Name %in% pull(
    slice_max(df_4_mdist, m_dist, n = 10), Country.Name
  )) %>%
  mutate(Value = replace(
    Value,
    Variable %in% c("Population", "Welfare.Coverage",
                    "Maternity.Paternity.Leave.Diff"),
    Inf
  )) %>%
  ggplot(aes(
    x = factor(Country.Name,
               levels = pull(slice_max(df_4_mdist, m_dist, n = 10),
                             Country.Name)),
    y = Variable
  )) +
  geom_tile(aes(fill = Value), color = "white") +
  geom_text(aes(label = round(Value, 2)), color = "black", size = 2.5) +
  scale_fill_gradient2(low = "tomato", mid = "white", high = "white",
                       midpoint = 3,
                       name = "Difference \n(scaled)") +
  labs(title = "Difference to Singapore",
       x = "Country", y = "Variable") +
  coord_fixed() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.